gitextract_uvlvpncm/

├── .dockerignore
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug-report.yml
│   │   ├── config.yml
│   │   ├── feature-request.yml
│   │   └── new-model-addition.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── autodocs.yaml
│       ├── build.yaml
│       ├── build_documentation.yaml
│       ├── build_pr_documentation.yaml
│       ├── ci_build.yaml
│       ├── client-tests.yaml
│       ├── codeql.yml
│       ├── integration_tests.yaml
│       ├── load_test.yaml
│       ├── nix_build.yaml
│       ├── nix_cache.yaml
│       ├── nix_tests.yaml
│       ├── stale.yaml
│       ├── tests.yaml
│       ├── trufflehog.yaml
│       └── upload_pr_documentation.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── .redocly.lint-ignore.yaml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Cargo.toml
├── Dockerfile
├── Dockerfile.neuron
├── Dockerfile.nix
├── Dockerfile_amd
├── Dockerfile_gaudi
├── Dockerfile_intel
├── Dockerfile_llamacpp
├── Dockerfile_trtllm
├── LICENSE
├── Makefile
├── README.md
├── assets/
│   └── tgi_grafana.json
├── backends/
│   ├── client/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   └── src/
│   │       ├── lib.rs
│   │       ├── v2/
│   │       │   ├── client.rs
│   │       │   ├── mod.rs
│   │       │   └── sharded_client.rs
│   │       └── v3/
│   │           ├── client.rs
│   │           ├── mod.rs
│   │           └── sharded_client.rs
│   ├── gaudi/
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── examples/
│   │   │   └── docker_commands/
│   │   │       └── docker_commands.md
│   │   ├── server/
│   │   │   ├── .gitignore
│   │   │   ├── Makefile
│   │   │   ├── Makefile-awq
│   │   │   ├── Makefile-eetq
│   │   │   ├── Makefile-fbgemm
│   │   │   ├── Makefile-flash-att
│   │   │   ├── Makefile-flash-att-v2
│   │   │   ├── Makefile-selective-scan
│   │   │   ├── Makefile-vllm
│   │   │   ├── README.md
│   │   │   ├── dill-0.3.7-patch.sh
│   │   │   ├── dill-0.3.8-patch.sh
│   │   │   ├── pyproject.toml
│   │   │   ├── requirements.txt
│   │   │   └── text_generation_server/
│   │   │       ├── __init__.py
│   │   │       ├── adapters/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── config.py
│   │   │       │   ├── lora.py
│   │   │       │   └── weights.py
│   │   │       ├── cache.py
│   │   │       ├── cli.py
│   │   │       ├── interceptor.py
│   │   │       ├── layers/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── attention/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── common.py
│   │   │       │   │   ├── hpu.py
│   │   │       │   │   └── kv_cache.py
│   │   │       │   ├── awq/
│   │   │       │   │   ├── conversion_utils.py
│   │   │       │   │   └── quantize/
│   │   │       │   │       ├── __init__.py
│   │   │       │   │       └── hpu.py
│   │   │       │   ├── bnb.py
│   │   │       │   ├── compressed_tensors/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── loader.py
│   │   │       │   │   └── w8an_fp.py
│   │   │       │   ├── conv.py
│   │   │       │   ├── exl2.py
│   │   │       │   ├── fp8.py
│   │   │       │   ├── gptq/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── hpu.py
│   │   │       │   │   ├── quantize.py
│   │   │       │   │   └── utils.py
│   │   │       │   ├── layernorm.py
│   │   │       │   ├── linear.py
│   │   │       │   ├── lora.py
│   │   │       │   ├── medusa.py
│   │   │       │   ├── mlp.py
│   │   │       │   ├── moe/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── fp8.py
│   │   │       │   │   ├── fused_moe.py
│   │   │       │   │   └── unquantized.py
│   │   │       │   ├── rotary.py
│   │   │       │   ├── speculative.py
│   │   │       │   └── tensor_parallel.py
│   │   │       ├── models/
│   │   │       │   ├── __init__.py
│   │   │       │   ├── custom_modeling/
│   │   │       │   │   ├── __init__.py
│   │   │       │   │   ├── bloom_modeling.py
│   │   │       │   │   ├── clip.py
│   │   │       │   │   ├── flash_cohere_modeling.py
│   │   │       │   │   ├── flash_dbrx_modeling.py
│   │   │       │   │   ├── flash_deepseek_v2_modeling.py
│   │   │       │   │   ├── flash_deepseek_v3_modeling.py
│   │   │       │   │   ├── flash_gemma2_modeling.py
│   │   │       │   │   ├── flash_gemma3_modeling.py
│   │   │       │   │   ├── flash_gemma_modeling.py
│   │   │       │   │   ├── flash_gpt2_modeling.py
│   │   │       │   │   ├── flash_gptj_modeling.py
│   │   │       │   │   ├── flash_llama4_modeling.py
│   │   │       │   │   ├── flash_llama_modeling.py
│   │   │       │   │   ├── flash_llava_next.py
│   │   │       │   │   ├── flash_mistral_modeling.py
│   │   │       │   │   ├── flash_mixtral_modeling.py
│   │   │       │   │   ├── flash_mllama.py
│   │   │       │   │   ├── flash_neox_modeling.py
│   │   │       │   │   ├── flash_pali_gemma_modeling.py
│   │   │       │   │   ├── flash_phi_modeling.py
│   │   │       │   │   ├── flash_phi_moe_modeling.py
│   │   │       │   │   ├── flash_qwen2_modeling.py
│   │   │       │   │   ├── flash_qwen3_modeling.py
│   │   │       │   │   ├── flash_qwen3_moe_modeling.py
│   │   │       │   │   ├── flash_rw_modeling.py
│   │   │       │   │   ├── flash_santacoder_modeling.py
│   │   │       │   │   ├── flash_starcoder2_modeling.py
│   │   │       │   │   ├── idefics2.py
│   │   │       │   │   ├── idefics3.py
│   │   │       │   │   ├── mamba_modeling.py
│   │   │       │   │   ├── qwen2_5_vl.py
│   │   │       │   │   ├── qwen2_vl.py
│   │   │       │   │   ├── siglip.py
│   │   │       │   │   └── vlm.py
│   │   │       │   ├── flash_causal_lm.py
│   │   │       │   ├── flash_vlm_causal_lm.py
│   │   │       │   ├── globals.py
│   │   │       │   ├── mllama_causal_lm.py
│   │   │       │   ├── model.py
│   │   │       │   ├── seq2seq_lm.py
│   │   │       │   └── types.py
│   │   │       ├── pb/
│   │   │       │   └── .gitignore
│   │   │       ├── server.py
│   │   │       ├── tracing.py
│   │   │       └── utils/
│   │   │           ├── __init__.py
│   │   │           ├── adapter.py
│   │   │           ├── chunks.py
│   │   │           ├── convert.py
│   │   │           ├── debug.py
│   │   │           ├── dist.py
│   │   │           ├── hub.py
│   │   │           ├── import_utils.py
│   │   │           ├── kernels.py
│   │   │           ├── log.py
│   │   │           ├── logits_process.py
│   │   │           ├── merges/
│   │   │           │   ├── strategies.py
│   │   │           │   └── utils.py
│   │   │           ├── peft.py
│   │   │           ├── prefill_chunking.py
│   │   │           ├── quantization.py
│   │   │           ├── segments.py
│   │   │           ├── sgmv.py
│   │   │           ├── speculate.py
│   │   │           ├── tokens.py
│   │   │           ├── version.py
│   │   │           ├── watermark.py
│   │   │           └── weights.py
│   │   └── tgi-entrypoint.sh
│   ├── grpc-metadata/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── llamacpp/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build.rs
│   │   ├── requirements.txt
│   │   └── src/
│   │       ├── backend.rs
│   │       ├── llamacpp.rs
│   │       ├── main.rs
│   │       └── quantize.rs
│   ├── neuron/
│   │   ├── Cargo.toml
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── server/
│   │   │   ├── .gitignore
│   │   │   ├── Makefile
│   │   │   ├── build-requirements.txt
│   │   │   ├── pyproject.toml
│   │   │   └── text_generation_server/
│   │   │       ├── cli.py
│   │   │       ├── generator.py
│   │   │       ├── interceptor.py
│   │   │       ├── model.py
│   │   │       ├── server.py
│   │   │       └── tgi_env.py
│   │   ├── tests/
│   │   │   ├── conftest.py
│   │   │   ├── fixtures/
│   │   │   │   └── model.py
│   │   │   ├── prune_test_models.py
│   │   │   ├── pytest.ini
│   │   │   ├── requirements.txt
│   │   │   ├── server/
│   │   │   │   ├── helpers.py
│   │   │   │   ├── test_cached_model.py
│   │   │   │   ├── test_continuous_batching.py
│   │   │   │   ├── test_decode.py
│   │   │   │   ├── test_generator_slot.py
│   │   │   │   ├── test_info.py
│   │   │   │   └── test_prefill.py
│   │   │   └── test_entry_point.py
│   │   ├── tgi-entrypoint.sh
│   │   └── tgi_entry_point.py
│   ├── trtllm/
│   │   ├── CMakeLists.txt
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── build.rs
│   │   ├── cmake/
│   │   │   ├── json.cmake
│   │   │   ├── spdlog.cmake
│   │   │   ├── trtllm.cmake
│   │   │   └── utils/
│   │   │       └── detect_cuda_arch.cu
│   │   ├── csrc/
│   │   │   ├── backend.cpp
│   │   │   ├── backend.hpp
│   │   │   ├── ffi.hpp
│   │   │   └── hardware.hpp
│   │   ├── scripts/
│   │   │   ├── install_tensorrt.sh
│   │   │   └── setup_sccache.py
│   │   ├── src/
│   │   │   ├── errors.rs
│   │   │   ├── lib.rs
│   │   │   ├── looper.rs
│   │   │   ├── main.rs
│   │   │   └── utils.rs
│   │   └── tests/
│   │       ├── test_backend.cpp
│   │       └── test_hardware.cpp
│   ├── v2/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   └── src/
│   │       ├── backend.rs
│   │       ├── client/
│   │       │   ├── grpc_client.rs
│   │       │   ├── mod.rs
│   │       │   └── sharded_client.rs
│   │       ├── lib.rs
│   │       ├── main.rs
│   │       └── queue.rs
│   └── v3/
│       ├── Cargo.toml
│       ├── benches/
│       │   └── prefix_cache.rs
│       ├── build.rs
│       └── src/
│           ├── backend.rs
│           ├── block_allocator.rs
│           ├── client/
│           │   ├── grpc_client.rs
│           │   ├── mod.rs
│           │   └── sharded_client.rs
│           ├── lib.rs
│           ├── main.rs
│           ├── queue.rs
│           └── radix.rs
├── benchmark/
│   ├── Cargo.toml
│   ├── README.md
│   └── src/
│       ├── app.rs
│       ├── event.rs
│       ├── generation.rs
│       ├── lib.rs
│       ├── main.rs
│       ├── table.rs
│       └── utils.rs
├── clients/
│   └── python/
│       ├── .gitignore
│       ├── Makefile
│       ├── README.md
│       ├── pyproject.toml
│       ├── tests/
│       │   ├── conftest.py
│       │   ├── test_client.py
│       │   ├── test_errors.py
│       │   ├── test_inference_api.py
│       │   └── test_types.py
│       └── text_generation/
│           ├── __init__.py
│           ├── client.py
│           ├── errors.py
│           ├── inference_api.py
│           └── types.py
├── crate-hashes.json
├── docs/
│   ├── README.md
│   ├── index.html
│   ├── openapi.json
│   └── source/
│       ├── _toctree.yml
│       ├── architecture.md
│       ├── backends/
│       │   ├── gaudi.mdx
│       │   ├── llamacpp.md
│       │   ├── neuron.md
│       │   └── trtllm.md
│       ├── basic_tutorials/
│       │   ├── consuming_tgi.md
│       │   ├── gated_model_access.md
│       │   ├── monitoring.md
│       │   ├── non_core_models.md
│       │   ├── preparing_model.md
│       │   ├── safety.md
│       │   ├── train_medusa.md
│       │   ├── using_cli.md
│       │   ├── using_guidance.md
│       │   └── visual_language_models.md
│       ├── conceptual/
│       │   ├── chunking.md
│       │   ├── external.md
│       │   ├── flash_attention.md
│       │   ├── guidance.md
│       │   ├── lora.md
│       │   ├── paged_attention.md
│       │   ├── quantization.md
│       │   ├── safetensors.md
│       │   ├── speculation.md
│       │   ├── streaming.md
│       │   └── tensor_parallelism.md
│       ├── index.md
│       ├── installation.md
│       ├── installation_amd.md
│       ├── installation_gaudi.md
│       ├── installation_inferentia.md
│       ├── installation_intel.md
│       ├── installation_nvidia.md
│       ├── installation_tpu.md
│       ├── multi_backend_support.md
│       ├── quicktour.md
│       ├── reference/
│       │   ├── api_reference.md
│       │   ├── launcher.md
│       │   └── metrics.md
│       ├── supported_models.md
│       └── usage_statistics.md
├── flake.nix
├── integration-tests/
│   ├── conftest.py
│   ├── fixtures/
│   │   ├── gaudi/
│   │   │   └── service.py
│   │   └── neuron/
│   │       ├── export_models.py
│   │       └── service.py
│   ├── gaudi/
│   │   ├── capture_expected_outputs.py
│   │   └── test_gaudi_generate.py
│   ├── models/
│   │   ├── __snapshots__/
│   │   │   ├── test.py
│   │   │   ├── test_bloom_560m/
│   │   │   │   ├── test_bloom_560m.json
│   │   │   │   ├── test_bloom_560m_all_params.json
│   │   │   │   └── test_bloom_560m_load.json
│   │   │   ├── test_bloom_560m_sharded/
│   │   │   │   ├── test_bloom_560m_sharded.json
│   │   │   │   └── test_bloom_560m_sharded_load.json
│   │   │   ├── test_chat_llama/
│   │   │   │   └── test_flash_llama_simple.json
│   │   │   ├── test_completion_prompts/
│   │   │   │   ├── test_chat_hfhub_nousage.json
│   │   │   │   ├── test_chat_hfhub_usage.json
│   │   │   │   ├── test_chat_openai_nousage.json
│   │   │   │   ├── test_chat_openai_usage.json
│   │   │   │   ├── test_flash_llama_completion_many_prompts.json
│   │   │   │   ├── test_flash_llama_completion_many_prompts_stream.json
│   │   │   │   ├── test_flash_llama_completion_single_prompt.json
│   │   │   │   └── test_flash_llama_completion_stream_usage.json
│   │   │   ├── test_compressed_tensors_w8a8_int/
│   │   │   │   ├── test_compressed_tensors_w8a8_int.json
│   │   │   │   ├── test_compressed_tensors_w8a8_int_all_params.json
│   │   │   │   └── test_compressed_tensors_w8a8_int_load.json
│   │   │   ├── test_compressed_tensors_w8a8_int_dynamic_weight/
│   │   │   │   ├── test_compressed_tensors_w8a8_int_dynamic_weight.json
│   │   │   │   ├── test_compressed_tensors_w8a8_int_dynamic_weight_all_params.json
│   │   │   │   └── test_compressed_tensors_w8a8_int_dynamic_weight_load.json
│   │   │   ├── test_compressed_tensors_w8an_fp/
│   │   │   │   ├── test_compressed_tensors_w8an.json
│   │   │   │   ├── test_compressed_tensors_w8an_all_params.json
│   │   │   │   └── test_compressed_tensors_w8an_load.json
│   │   │   ├── test_compressed_tensors_wna16_int/
│   │   │   │   ├── test_compressed_tensors_wna16.json
│   │   │   │   ├── test_compressed_tensors_wna16_all_params.json
│   │   │   │   └── test_compressed_tensors_wna16_load.json
│   │   │   ├── test_compressed_tensors_wna16_int_24/
│   │   │   │   ├── test_compressed_tensors_wna16_int_24.json
│   │   │   │   ├── test_compressed_tensors_wna16_int_24_all_params.json
│   │   │   │   └── test_compressed_tensors_wna16_int_24_load.json
│   │   │   ├── test_continue_final_message/
│   │   │   │   ├── test_llama_completion_single_prompt.json
│   │   │   │   └── test_llama_completion_single_prompt_continue.json
│   │   │   ├── test_flash_awq/
│   │   │   │   ├── test_flash_llama_awq.json
│   │   │   │   ├── test_flash_llama_awq_all_params.json
│   │   │   │   └── test_flash_llama_awq_load.json
│   │   │   ├── test_flash_awq_sharded/
│   │   │   │   ├── test_flash_llama_awq_load_sharded.json
│   │   │   │   └── test_flash_llama_awq_sharded.json
│   │   │   ├── test_flash_deepseek_v2/
│   │   │   │   ├── test_flash_deepseek_v2.json
│   │   │   │   ├── test_flash_deepseek_v2_all_params.json
│   │   │   │   └── test_flash_deepseek_v2_load.json
│   │   │   ├── test_flash_falcon/
│   │   │   │   ├── test_flash_falcon.json
│   │   │   │   ├── test_flash_falcon_all_params.json
│   │   │   │   └── test_flash_falcon_load.json
│   │   │   ├── test_flash_gemma/
│   │   │   │   ├── test_flash_gemma_all_params.json
│   │   │   │   ├── test_flash_gemma_load.json
│   │   │   │   └── test_flash_gemma_simple.json
│   │   │   ├── test_flash_gemma2/
│   │   │   │   ├── test_flash_gemma2.json
│   │   │   │   └── test_flash_gemma2_load.json
│   │   │   ├── test_flash_gemma3/
│   │   │   │   ├── test_exceed_window.json
│   │   │   │   ├── test_flash_gemma3.json
│   │   │   │   ├── test_flash_gemma3_image_base64_rgb_jpg.json
│   │   │   │   ├── test_flash_gemma3_image_base64_rgb_png.json
│   │   │   │   ├── test_flash_gemma3_image_base64_rgba.json
│   │   │   │   ├── test_flash_gemma3_image_cow.json
│   │   │   │   └── test_flash_gemma3_image_cow_dog.json
│   │   │   ├── test_flash_gemma_gptq/
│   │   │   │   ├── test_flash_gemma_gptq.json
│   │   │   │   ├── test_flash_gemma_gptq_all_params.json
│   │   │   │   └── test_flash_gemma_gptq_load.json
│   │   │   ├── test_flash_gpt2/
│   │   │   │   ├── test_flash_gpt2.json
│   │   │   │   └── test_flash_gpt2_load.json
│   │   │   ├── test_flash_grammar_llama/
│   │   │   │   ├── test_flash_llama_grammar.json
│   │   │   │   ├── test_flash_llama_grammar_json.json
│   │   │   │   ├── test_flash_llama_grammar_load.json
│   │   │   │   ├── test_flash_llama_grammar_regex.json
│   │   │   │   └── test_flash_llama_grammar_single_load_instance.json
│   │   │   ├── test_flash_llama/
│   │   │   │   ├── test_flash_llama_all_params.json
│   │   │   │   ├── test_flash_llama_load.json
│   │   │   │   └── test_flash_llama_simple.json
│   │   │   ├── test_flash_llama_exl2/
│   │   │   │   ├── test_flash_llama_exl2.json
│   │   │   │   ├── test_flash_llama_exl2_all_params.json
│   │   │   │   └── test_flash_llama_exl2_load.json
│   │   │   ├── test_flash_llama_fp8/
│   │   │   │   ├── test_flash_llama_fp8.json
│   │   │   │   ├── test_flash_llama_fp8_all_params.json
│   │   │   │   └── test_flash_llama_fp8_load.json
│   │   │   ├── test_flash_llama_fp8_kv_cache/
│   │   │   │   ├── test_flash_llama_fp8_kv_cache.json
│   │   │   │   ├── test_flash_llama_fp8_kv_cache_all_params.json
│   │   │   │   └── test_flash_llama_fp8_kv_cache_load.json
│   │   │   ├── test_flash_llama_gptq/
│   │   │   │   ├── test_flash_llama_gptq.json
│   │   │   │   ├── test_flash_llama_gptq_all_params.json
│   │   │   │   └── test_flash_llama_gptq_load.json
│   │   │   ├── test_flash_llama_marlin/
│   │   │   │   ├── test_flash_llama_marlin.json
│   │   │   │   ├── test_flash_llama_marlin_all_params.json
│   │   │   │   └── test_flash_llama_marlin_load.json
│   │   │   ├── test_flash_llama_marlin_24/
│   │   │   │   ├── test_flash_llama_marlin.json
│   │   │   │   ├── test_flash_llama_marlin24_all_params.json
│   │   │   │   └── test_flash_llama_marlin24_load.json
│   │   │   ├── test_flash_llama_prefix/
│   │   │   │   └── test_flash_llama_load.json
│   │   │   ├── test_flash_llama_prefix_flashdecoding/
│   │   │   │   └── test_flash_llama_flashdecoding.json
│   │   │   ├── test_flash_medusa/
│   │   │   │   ├── test_flash_medusa_all_params.json
│   │   │   │   ├── test_flash_medusa_load.json
│   │   │   │   └── test_flash_medusa_simple.json
│   │   │   ├── test_flash_mistral/
│   │   │   │   ├── test_flash_mistral.json
│   │   │   │   ├── test_flash_mistral_all_params.json
│   │   │   │   └── test_flash_mistral_load.json
│   │   │   ├── test_flash_mixtral/
│   │   │   │   ├── test_flash_mixtral.json
│   │   │   │   ├── test_flash_mixtral_all_params.json
│   │   │   │   └── test_flash_mixtral_load.json
│   │   │   ├── test_flash_mixtral_awq/
│   │   │   │   ├── test_flash_mixtral_awq.json
│   │   │   │   ├── test_flash_mixtral_awq_all_params.json
│   │   │   │   └── test_flash_mixtral_awq_load.json
│   │   │   ├── test_flash_mixtral_gptq/
│   │   │   │   ├── test_flash_mixtral_gptq.json
│   │   │   │   ├── test_flash_mixtral_gptq_all_params.json
│   │   │   │   └── test_flash_mixtral_gptq_load.json
│   │   │   ├── test_flash_neox/
│   │   │   │   ├── test_flash_neox.json
│   │   │   │   └── test_flash_neox_load.json
│   │   │   ├── test_flash_neox_sharded/
│   │   │   │   ├── test_flash_neox.json
│   │   │   │   └── test_flash_neox_load.json
│   │   │   ├── test_flash_pali_gemma/
│   │   │   │   ├── test_flash_pali_gemma.json
│   │   │   │   └── test_flash_pali_gemma_two_images.json
│   │   │   ├── test_flash_pali_gemma2/
│   │   │   │   └── test_flash_pali_gemma_image.json
│   │   │   ├── test_flash_phi/
│   │   │   │   ├── test_flash_phi.json
│   │   │   │   ├── test_flash_phi_all_params.json
│   │   │   │   └── test_flash_phi_load.json
│   │   │   ├── test_flash_phi35_moe/
│   │   │   │   ├── test_flash_phi35_moe.json
│   │   │   │   ├── test_flash_phi35_moe_all_params.json
│   │   │   │   └── test_flash_phi35_moe_load.json
│   │   │   ├── test_flash_qwen2/
│   │   │   │   ├── test_flash_qwen2.json
│   │   │   │   ├── test_flash_qwen2_all_params.json
│   │   │   │   └── test_flash_qwen2_load.json
│   │   │   ├── test_flash_qwen2_5_vl/
│   │   │   │   ├── test_flash_qwen2_5_vl_bay.json
│   │   │   │   ├── test_flash_qwen2_5_vl_inpaint.json
│   │   │   │   ├── test_flash_qwen2_5_vl_simple.json
│   │   │   │   └── test_flash_qwen2_5_vl_simple_streaming.json
│   │   │   ├── test_flash_qwen2_vl/
│   │   │   │   ├── test_flash_qwen2_vl_bay.json
│   │   │   │   ├── test_flash_qwen2_vl_inpaint.json
│   │   │   │   ├── test_flash_qwen2_vl_simple.json
│   │   │   │   └── test_flash_qwen2_vl_simple_streaming.json
│   │   │   ├── test_flash_santacoder/
│   │   │   │   ├── test_flash_santacoder.json
│   │   │   │   └── test_flash_santacoder_load.json
│   │   │   ├── test_flash_starcoder/
│   │   │   │   ├── test_flash_starcoder.json
│   │   │   │   ├── test_flash_starcoder_default_params.json
│   │   │   │   └── test_flash_starcoder_load.json
│   │   │   ├── test_flash_starcoder2/
│   │   │   │   ├── test_flash_starcoder2.json
│   │   │   │   ├── test_flash_starcoder2_default_params.json
│   │   │   │   └── test_flash_starcoder2_load.json
│   │   │   ├── test_flash_starcoder2_lora/
│   │   │   │   ├── test_flash_starcoder2.json
│   │   │   │   ├── test_flash_starcoder2_default_params.json
│   │   │   │   ├── test_flash_starcoder2_load.json
│   │   │   │   └── test_flash_starcoder2_with_hugcode_adapter.json
│   │   │   ├── test_flash_starcoder_gptq/
│   │   │   │   ├── test_flash_starcoder_gptq.json
│   │   │   │   ├── test_flash_starcoder_gptq_default_params.json
│   │   │   │   └── test_flash_starcoder_gptq_load.json
│   │   │   ├── test_grammar_llama/
│   │   │   │   └── test_non_flash_llama_grammar_json.json
│   │   │   ├── test_grammar_response_format_llama/
│   │   │   │   ├── test_grammar_response_format_llama_json.1.json
│   │   │   │   ├── test_grammar_response_format_llama_json.2.json
│   │   │   │   └── test_grammar_response_format_llama_json.json
│   │   │   ├── test_idefics/
│   │   │   │   ├── test_idefics.json
│   │   │   │   ├── test_idefics_load.json
│   │   │   │   └── test_idefics_two_images.json
│   │   │   ├── test_idefics2/
│   │   │   │   ├── test_flash_idefics2_next_all_params.json
│   │   │   │   ├── test_flash_idefics2_next_load.json
│   │   │   │   ├── test_flash_idefics2_next_simple.json
│   │   │   │   └── test_flash_idefics2_two_images.json
│   │   │   ├── test_idefics3/
│   │   │   │   └── test_flash_idefics3_next_simple_url.json
│   │   │   ├── test_json_schema_constrain/
│   │   │   │   ├── test_json_schema_basic.json
│   │   │   │   ├── test_json_schema_complex.json
│   │   │   │   └── test_json_schema_stream.json
│   │   │   ├── test_llava_next/
│   │   │   │   ├── test_flash_llava_next_all_params.json
│   │   │   │   ├── test_flash_llava_next_load.json
│   │   │   │   └── test_flash_llava_next_simple.json
│   │   │   ├── test_lora_mistral/
│   │   │   │   ├── test_lora_mistral_with_customer_support_adapter.json
│   │   │   │   ├── test_lora_mistral_with_dbpedia_adapter.json
│   │   │   │   ├── test_lora_mistral_without_adapter.json
│   │   │   │   └── test_lora_mistral_without_customer_support_adapter.json
│   │   │   ├── test_mamba/
│   │   │   │   ├── test_mamba.json
│   │   │   │   ├── test_mamba_all_params.json
│   │   │   │   └── test_mamba_load.json
│   │   │   ├── test_mllama/
│   │   │   │   ├── test_mllama_load.json
│   │   │   │   └── test_mllama_simpl.json
│   │   │   ├── test_mpt/
│   │   │   │   ├── test_mpt.json
│   │   │   │   └── test_mpt_load.json
│   │   │   ├── test_mt0_base/
│   │   │   │   ├── test_mt0_base.json
│   │   │   │   ├── test_mt0_base_all_params.json
│   │   │   │   └── test_mt0_base_load.json
│   │   │   ├── test_neox/
│   │   │   │   ├── test_neox.json
│   │   │   │   └── test_neox_load.json
│   │   │   ├── test_neox_sharded/
│   │   │   │   ├── test_neox.json
│   │   │   │   └── test_neox_load.json
│   │   │   ├── test_server_gptq_quantized/
│   │   │   │   ├── test_server_gptq_quantized.json
│   │   │   │   ├── test_server_gptq_quantized_all_params.json
│   │   │   │   └── test_server_gptq_quantized_load.json
│   │   │   ├── test_smolvlm/
│   │   │   │   └── test_flash_smolvlm_next_simple_url.json
│   │   │   ├── test_t5_sharded/
│   │   │   │   ├── test_t5_sharded.json
│   │   │   │   └── test_t5_sharded_load.json
│   │   │   ├── test_tools_llama/
│   │   │   │   ├── test_flash_llama_grammar_tools_auto_nostream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_choice_nostream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_choice_stream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_insufficient_information_nostream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_insufficient_information_stream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_nostream.json
│   │   │   │   ├── test_flash_llama_grammar_tools_openai.json
│   │   │   │   ├── test_flash_llama_grammar_tools_sea_creatures_stream_auto.json
│   │   │   │   ├── test_flash_llama_grammar_tools_sea_creatures_stream_function_object.json
│   │   │   │   ├── test_flash_llama_grammar_tools_sea_creatures_stream_none.json
│   │   │   │   ├── test_flash_llama_grammar_tools_sea_creatures_stream_required.json
│   │   │   │   └── test_flash_llama_tool_reply_response.json
│   │   │   ├── test_transformers_llama4/
│   │   │   │   ├── test_flash_llama4.json
│   │   │   │   ├── test_flash_llama4_image_base64_rgb_jpg.json
│   │   │   │   ├── test_flash_llama4_image_base64_rgb_png.json
│   │   │   │   ├── test_flash_llama4_image_base64_rgba.json
│   │   │   │   ├── test_flash_llama4_image_cow.json
│   │   │   │   └── test_flash_llama4_image_cow_dog.json
│   │   │   └── test_transformers_olmo/
│   │   │       ├── test_flash_llama_load.json
│   │   │       └── test_flash_llama_simple.json
│   │   ├── test_bloom_560m.py
│   │   ├── test_bloom_560m_sharded.py
│   │   ├── test_chat_llama.py
│   │   ├── test_chat_stream_options.py
│   │   ├── test_completion_prompts.py
│   │   ├── test_compressed_tensors_w8a8_int.py
│   │   ├── test_compressed_tensors_w8a8_int_dynamic_weight.py
│   │   ├── test_compressed_tensors_w8an_fp.py
│   │   ├── test_compressed_tensors_wna16_int.py
│   │   ├── test_compressed_tensors_wna16_int_24.py
│   │   ├── test_continue_final_message.py
│   │   ├── test_flash_awq.py
│   │   ├── test_flash_awq_sharded.py
│   │   ├── test_flash_deepseek_v2.py
│   │   ├── test_flash_falcon.py
│   │   ├── test_flash_gemma.py
│   │   ├── test_flash_gemma2.py
│   │   ├── test_flash_gemma3.py
│   │   ├── test_flash_gemma_gptq.py
│   │   ├── test_flash_gpt2.py
│   │   ├── test_flash_grammar_llama.py
│   │   ├── test_flash_llama.py
│   │   ├── test_flash_llama_exl2.py
│   │   ├── test_flash_llama_fp8.py
│   │   ├── test_flash_llama_fp8_kv_cache.py
│   │   ├── test_flash_llama_gptq.py
│   │   ├── test_flash_llama_marlin.py
│   │   ├── test_flash_llama_marlin_24.py
│   │   ├── test_flash_llama_prefix.py
│   │   ├── test_flash_llama_prefix_flashdecoding.py
│   │   ├── test_flash_medusa.py
│   │   ├── test_flash_mistral.py
│   │   ├── test_flash_mixtral.py
│   │   ├── test_flash_mixtral_awq.py
│   │   ├── test_flash_mixtral_gptq.py
│   │   ├── test_flash_neox.py
│   │   ├── test_flash_neox_sharded.py
│   │   ├── test_flash_pali_gemma.py
│   │   ├── test_flash_pali_gemma2.py
│   │   ├── test_flash_phi.py
│   │   ├── test_flash_phi35_moe.py
│   │   ├── test_flash_qwen2.py
│   │   ├── test_flash_qwen2_5_vl.py
│   │   ├── test_flash_qwen2_vl.py
│   │   ├── test_flash_santacoder.py
│   │   ├── test_flash_starcoder.py
│   │   ├── test_flash_starcoder2.py
│   │   ├── test_flash_starcoder2_lora.py
│   │   ├── test_flash_starcoder_gptq.py
│   │   ├── test_grammar_llama.py
│   │   ├── test_grammar_response_format_llama.py
│   │   ├── test_idefics.py
│   │   ├── test_idefics2.py
│   │   ├── test_idefics3.py
│   │   ├── test_json_schema_constrain.py
│   │   ├── test_llava_next.py
│   │   ├── test_lora_mistral.py
│   │   ├── test_mamba.py
│   │   ├── test_mllama.py
│   │   ├── test_mpt.py
│   │   ├── test_mt0_base.py
│   │   ├── test_neox.py
│   │   ├── test_neox_sharded.py
│   │   ├── test_opt.py
│   │   ├── test_smolvlm.py
│   │   ├── test_t5_sharded.py
│   │   ├── test_tools_llama.py
│   │   ├── test_transformers_llama4.py
│   │   └── test_transformers_olmo.py
│   ├── neuron/
│   │   ├── test_generate.py
│   │   └── test_implicit_env.py
│   ├── pyproject.toml
│   ├── pytest.ini
│   └── requirements.txt
├── launcher/
│   ├── Cargo.toml
│   ├── build.rs
│   └── src/
│       ├── env_runtime.rs
│       ├── gpu.rs
│       └── main.rs
├── load_tests/
│   ├── Makefile
│   ├── benchmarks.py
│   ├── common.js
│   ├── filter.py
│   ├── long.js
│   ├── long.py
│   ├── long_prompt2.py
│   ├── orca.py
│   └── pyproject.toml
├── nix/
│   ├── client.nix
│   ├── crate-overrides.nix
│   ├── docker.nix
│   ├── impure-shell.nix
│   ├── overlay.nix
│   └── server.nix
├── proto/
│   ├── generate.proto
│   └── v3/
│       └── generate.proto
├── router/
│   ├── Cargo.toml
│   ├── README.md
│   ├── build.rs
│   └── src/
│       ├── chat.rs
│       ├── config.rs
│       ├── infer/
│       │   ├── chat_template.rs
│       │   ├── mod.rs
│       │   └── tool_grammar.rs
│       ├── kserve.rs
│       ├── lib.rs
│       ├── logging.rs
│       ├── sagemaker.rs
│       ├── server.rs
│       ├── usage_stats.rs
│       ├── validation.rs
│       └── vertex.rs
├── rust-toolchain.toml
├── sagemaker-entrypoint.sh
├── server/
│   ├── .gitignore
│   ├── Makefile
│   ├── Makefile-awq
│   ├── Makefile-eetq
│   ├── Makefile-exllamav2
│   ├── Makefile-flash-att
│   ├── Makefile-flash-att-v2
│   ├── Makefile-flashinfer
│   ├── Makefile-selective-scan
│   ├── Makefile-vllm
│   ├── README.md
│   ├── bounds-from-nix.py
│   ├── custom_kernels/
│   │   ├── custom_kernels/
│   │   │   ├── fused_attention_cuda.cu
│   │   │   └── fused_bloom_attention_cuda.cu
│   │   └── setup.py
│   ├── exllama_kernels/
│   │   ├── exllama_kernels/
│   │   │   ├── cu_compat.cuh
│   │   │   ├── cuda_buffers.cu
│   │   │   ├── cuda_buffers.cuh
│   │   │   ├── cuda_func/
│   │   │   │   ├── column_remap.cu
│   │   │   │   ├── column_remap.cuh
│   │   │   │   ├── q4_matmul.cu
│   │   │   │   ├── q4_matmul.cuh
│   │   │   │   ├── q4_matrix.cu
│   │   │   │   └── q4_matrix.cuh
│   │   │   ├── exllama_ext.cpp
│   │   │   ├── hip_compat.cuh
│   │   │   ├── matrix.cuh
│   │   │   ├── tuning.h
│   │   │   └── util.cuh
│   │   └── setup.py
│   ├── exllamav2_kernels/
│   │   ├── exllamav2_kernels/
│   │   │   ├── config.h
│   │   │   ├── cpp/
│   │   │   │   └── util.h
│   │   │   ├── cuda/
│   │   │   │   ├── compat.cuh
│   │   │   │   ├── matrix_view.cuh
│   │   │   │   ├── q_gemm.cu
│   │   │   │   ├── q_gemm.cuh
│   │   │   │   ├── q_gemm_kernel.cuh
│   │   │   │   ├── q_gemm_kernel_gptq.cuh
│   │   │   │   ├── q_matrix.cu
│   │   │   │   ├── q_matrix.cuh
│   │   │   │   ├── quant/
│   │   │   │   │   ├── qdq_2.cuh
│   │   │   │   │   ├── qdq_3.cuh
│   │   │   │   │   ├── qdq_4.cuh
│   │   │   │   │   ├── qdq_5.cuh
│   │   │   │   │   ├── qdq_6.cuh
│   │   │   │   │   ├── qdq_8.cuh
│   │   │   │   │   └── qdq_util.cuh
│   │   │   │   └── util.cuh
│   │   │   └── ext.cpp
│   │   └── setup.py
│   ├── pyproject.toml
│   ├── req.txt
│   ├── requirements_cuda.txt
│   ├── requirements_gen.txt
│   ├── requirements_intel.txt
│   ├── requirements_rocm.txt
│   ├── tests/
│   │   ├── conftest.py
│   │   ├── models/
│   │   │   ├── test_bloom.py
│   │   │   ├── test_causal_lm.py
│   │   │   ├── test_model.py
│   │   │   ├── test_santacoder.py
│   │   │   └── test_seq2seq_lm.py
│   │   └── utils/
│   │       ├── test_adapter.py
│   │       ├── test_convert.py
│   │       ├── test_hub.py
│   │       ├── test_layers.py
│   │       ├── test_tokens.py
│   │       ├── test_watermark.py
│   │       └── test_weights.py
│   └── text_generation_server/
│       ├── __init__.py
│       ├── adapters/
│       │   ├── __init__.py
│       │   ├── config.py
│       │   ├── lora.py
│       │   └── weights.py
│       ├── cache.py
│       ├── cli.py
│       ├── interceptor.py
│       ├── layers/
│       │   ├── __init__.py
│       │   ├── attention/
│       │   │   ├── __init__.py
│       │   │   ├── common.py
│       │   │   ├── cuda.py
│       │   │   ├── flash_attn_triton.py
│       │   │   ├── flashinfer.py
│       │   │   ├── ipex.py
│       │   │   ├── kv_cache.py
│       │   │   └── rocm.py
│       │   ├── awq/
│       │   │   ├── conversion_utils.py
│       │   │   └── quantize/
│       │   │       ├── __init__.py
│       │   │       ├── cuda.py
│       │   │       └── ipex.py
│       │   ├── bnb.py
│       │   ├── compressed_tensors/
│       │   │   ├── __init__.py
│       │   │   ├── loader.py
│       │   │   ├── w8a8_int.py
│       │   │   ├── w8an_fp.py
│       │   │   ├── wna16_int.py
│       │   │   └── wna16_int_24.py
│       │   ├── conv.py
│       │   ├── eetq.py
│       │   ├── exl2.py
│       │   ├── fp8.py
│       │   ├── gptq/
│       │   │   ├── __init__.py
│       │   │   ├── custom_autotune.py
│       │   │   ├── exllama.py
│       │   │   ├── exllamav2.py
│       │   │   ├── ipex.py
│       │   │   ├── quantize.py
│       │   │   ├── triton.py
│       │   │   └── utils.py
│       │   ├── layernorm.py
│       │   ├── linear.py
│       │   ├── lora.py
│       │   ├── marlin/
│       │   │   ├── __init__.py
│       │   │   ├── fp8.py
│       │   │   ├── gptq.py
│       │   │   ├── marlin.py
│       │   │   └── util.py
│       │   ├── medusa.py
│       │   ├── mlp.py
│       │   ├── moe/
│       │   │   ├── __init__.py
│       │   │   ├── fp8.py
│       │   │   ├── fused_moe_ipex.py
│       │   │   ├── gptq_marlin.py
│       │   │   └── unquantized.py
│       │   ├── rotary.py
│       │   ├── speculative.py
│       │   └── tensor_parallel.py
│       ├── models/
│       │   ├── __init__.py
│       │   ├── bloom.py
│       │   ├── causal_lm.py
│       │   ├── custom_modeling/
│       │   │   ├── __init__.py
│       │   │   ├── bloom_modeling.py
│       │   │   ├── clip.py
│       │   │   ├── flash_cohere_modeling.py
│       │   │   ├── flash_dbrx_modeling.py
│       │   │   ├── flash_deepseek_v2_modeling.py
│       │   │   ├── flash_deepseek_v3_modeling.py
│       │   │   ├── flash_gemma2_modeling.py
│       │   │   ├── flash_gemma3_modeling.py
│       │   │   ├── flash_gemma_modeling.py
│       │   │   ├── flash_gpt2_modeling.py
│       │   │   ├── flash_gptj_modeling.py
│       │   │   ├── flash_llama_modeling.py
│       │   │   ├── flash_mistral_modeling.py
│       │   │   ├── flash_mixtral_modeling.py
│       │   │   ├── flash_neox_modeling.py
│       │   │   ├── flash_pali_gemma_modeling.py
│       │   │   ├── flash_phi_modeling.py
│       │   │   ├── flash_phi_moe_modeling.py
│       │   │   ├── flash_qwen2_modeling.py
│       │   │   ├── flash_rw_modeling.py
│       │   │   ├── flash_santacoder_modeling.py
│       │   │   ├── flash_starcoder2_modeling.py
│       │   │   ├── gemma3/
│       │   │   │   ├── configuration_gemma3.py
│       │   │   │   ├── image_processing_gemma3.py
│       │   │   │   ├── processing_gemma3.py
│       │   │   │   └── utils.py
│       │   │   ├── idefics2.py
│       │   │   ├── idefics3.py
│       │   │   ├── idefics_config.py
│       │   │   ├── idefics_image_processing.py
│       │   │   ├── idefics_modeling.py
│       │   │   ├── idefics_perceiver.py
│       │   │   ├── idefics_processing.py
│       │   │   ├── idefics_vision.py
│       │   │   ├── llava_next.py
│       │   │   ├── mamba_modeling.py
│       │   │   ├── mllama.py
│       │   │   ├── mpt_modeling.py
│       │   │   ├── neox_modeling.py
│       │   │   ├── opt_modeling.py
│       │   │   ├── phi_modeling.py
│       │   │   ├── qwen2_5_vl.py
│       │   │   ├── qwen2_vl.py
│       │   │   ├── siglip.py
│       │   │   ├── t5_modeling.py
│       │   │   └── vlm.py
│       │   ├── flash_causal_lm.py
│       │   ├── galactica.py
│       │   ├── globals.py
│       │   ├── idefics_causal_lm.py
│       │   ├── mamba.py
│       │   ├── metadata_kernels.py
│       │   ├── mllama_causal_lm.py
│       │   ├── model.py
│       │   ├── seq2seq_lm.py
│       │   ├── transformers_flash_causal_lm.py
│       │   ├── transformers_flash_vlm.py
│       │   ├── types.py
│       │   └── vlm_causal_lm.py
│       ├── pb/
│       │   └── .gitignore
│       ├── server.py
│       ├── tracing.py
│       └── utils/
│           ├── __init__.py
│           ├── adapter.py
│           ├── chunks.py
│           ├── convert.py
│           ├── dist.py
│           ├── hub.py
│           ├── import_utils.py
│           ├── kernels.py
│           ├── log.py
│           ├── logits_process.py
│           ├── merges/
│           │   ├── strategies.py
│           │   └── utils.py
│           ├── peft.py
│           ├── prefill_chunking.py
│           ├── quantization.py
│           ├── segments.py
│           ├── speculate.py
│           ├── tokens.py
│           ├── watermark.py
│           └── weights.py
├── tgi-entrypoint.sh
└── update_doc.py