gitextract_8m4rfrdr/

├── .claude/
│   └── skills/
│       ├── add-jit-kernel/
│       │   └── SKILL.md
│       ├── add-sgl-kernel/
│       │   └── SKILL.md
│       ├── sglang-bisect-ci-regression/
│       │   └── SKILL.md
│       └── write-sglang-test/
│           └── SKILL.md
├── .codespellrc
├── .coveragerc
├── .devcontainer/
│   ├── Dockerfile
│   └── devcontainer.json
├── .github/
│   ├── CI_PERMISSIONS.json
│   ├── CODEOWNERS
│   ├── FOLDER_README.md
│   ├── ISSUE_TEMPLATE/
│   │   ├── 1-bug-report.yml
│   │   └── 2-feature-request.yml
│   ├── MAINTAINER.md
│   ├── actions/
│   │   ├── upload-cuda-coredumps/
│   │   │   └── action.yml
│   │   └── wait-for-jobs/
│   │       └── action.yml
│   ├── labeler.yml
│   ├── pull_request_template.md
│   ├── update_ci_permission.py
│   └── workflows/
│       ├── amd-aiter-scout.yml
│       ├── amd-ci-job-monitor.yml
│       ├── auto-tune.yml
│       ├── bot-bump-flashinfer-version.yml
│       ├── bot-bump-kernel-version-to-sglang.yml
│       ├── bot-bump-kernel-version.yml
│       ├── bot-bump-sglang-version.yml
│       ├── bot-cherry-pick.yml
│       ├── cancel-pr-workflow-on-merge.yml
│       ├── cancel-unfinished-pr-tests.yml
│       ├── ci-coverage-overview.yml
│       ├── ci-failure-monitor.yml
│       ├── close-inactive-issues.yml
│       ├── diffusion-ci-gt-gen.yml
│       ├── execute-notebook.yml
│       ├── labeler.yml
│       ├── lint.yml
│       ├── list-active-pr-runs.yml.yml
│       ├── nightly-release-gateway.yml
│       ├── nightly-test-amd-rocm720.yml
│       ├── nightly-test-amd.yml
│       ├── nightly-test-intel.yml
│       ├── nightly-test-npu.yml
│       ├── nightly-test-nvidia.yml
│       ├── open-pr-copy-from-oss.yml
│       ├── open-pr-copy-to-oss.yml
│       ├── patch-docker-dev.yml
│       ├── pr-benchmark-rust.yml
│       ├── pr-gate.yml
│       ├── pr-test-amd-rocm720.yml
│       ├── pr-test-amd.yml
│       ├── pr-test-npu.yml
│       ├── pr-test-rust.yml
│       ├── pr-test-xeon.yml
│       ├── pr-test-xpu.yml
│       ├── pr-test.yml
│       ├── release-branch-cut.yml
│       ├── release-docker-amd-nightly.yml
│       ├── release-docker-amd-rocm720-nightly.yml
│       ├── release-docker-amd.yml
│       ├── release-docker-cu13-framework.yml
│       ├── release-docker-dev.yml
│       ├── release-docker-gateway.yml
│       ├── release-docker-npu-nightly.yml
│       ├── release-docker-npu.yml
│       ├── release-docker-xeon.yml
│       ├── release-docker.yml
│       ├── release-docs.yml
│       ├── release-pypi-gateway.yml
│       ├── release-pypi-nightly.yml
│       ├── release-pypi-pr.yml
│       ├── release-pypi.yml
│       ├── release-tag.yml
│       ├── release-whl-kernel.yml
│       ├── rerun-ut.yml
│       ├── retag-docker.yml
│       ├── runner-utilization.yml
│       ├── slash-command-handler.yml
│       ├── stress-test.yml
│       └── weekly-test-nvidia.yml
├── .gitignore
├── .isort.cfg
├── .pre-commit-config.yaml
├── 3rdparty/
│   └── amd/
│       ├── profiling/
│       │   ├── PROFILING.md
│       │   ├── client.sh
│       │   ├── install_rpd.sh
│       │   ├── loadTracer.sh
│       │   ├── rpd.patch
│       │   ├── rpd_profile_server_enable.patch
│       │   ├── rpd_profile_server_enable_wCPU_activities.patch
│       │   ├── server.sh
│       │   └── torch_profiler.patch
│       ├── tuning/
│       │   ├── TUNING.md
│       │   └── benchmark_moe_rocm.py
│       └── wheel/
│           ├── README.md
│           ├── sgl-kernel/
│           │   ├── CMakeLists_rocm.txt
│           │   ├── build_rocm.sh
│           │   ├── rename_wheels_rocm.sh
│           │   └── rocm_hipify.py
│           └── sglang/
│               └── pyproject.toml
├── LICENSE
├── README.md
├── benchmark/
│   ├── asr/
│   │   ├── README.md
│   │   └── bench_sglang.py
│   ├── bench_attention_sink/
│   │   └── bench_attention_sink_triton.py
│   ├── bench_in_batch_prefix/
│   │   └── bench_in_batch_prefix.py
│   ├── bench_linear_attention/
│   │   ├── bench_gdn_decode.py
│   │   └── bench_gdn_prefill.py
│   ├── bench_rope/
│   │   └── benchmark_rope_index.py
│   ├── benchmark_batch/
│   │   ├── benchmark_batch.py
│   │   └── benchmark_tokenizer.py
│   ├── benchmark_vllm_060/
│   │   └── README.md
│   ├── blog_v0_2/
│   │   ├── 405b_sglang.sh
│   │   ├── 405b_trt.sh
│   │   ├── 405b_vllm.sh
│   │   ├── README.md
│   │   └── config.md
│   ├── boolq/
│   │   ├── README.md
│   │   ├── bench_sglang.py
│   │   ├── convert_parquet_to_json.py
│   │   └── parquet_to_json.sh
│   ├── ceval/
│   │   ├── README.md
│   │   └── bench_sglang.py
│   ├── deepseek_v3/
│   │   └── README.md
│   ├── dspy/
│   │   ├── README.md
│   │   └── bench_dspy_intro.py
│   ├── fla/
│   │   └── benchmark_layernorm_gated.py
│   ├── generative_agents/
│   │   ├── README.md
│   │   ├── agent_functions.py
│   │   ├── bench_other.py
│   │   └── bench_sglang.py
│   ├── gpt_oss/
│   │   └── README.md
│   ├── gsm8k/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   └── bench_sglang.py
│   ├── hellaswag/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   └── bench_sglang.py
│   ├── hf3fs/
│   │   ├── bench.sh
│   │   ├── bench_client.py
│   │   ├── bench_storage.py
│   │   └── bench_zerocopy.py
│   ├── hicache/
│   │   ├── README.md
│   │   ├── bench_long_context.py
│   │   ├── bench_mix.py
│   │   ├── bench_mix.sh
│   │   ├── bench_multiturn.py
│   │   ├── bench_serving.py
│   │   ├── data_processing.py
│   │   ├── download.sh
│   │   ├── nextqa.py
│   │   └── perf.py
│   ├── json_decode_regex/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   └── build_dataset.py
│   ├── json_jump_forward/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   ├── build_dataset.py
│   │   └── dataset.txt
│   ├── json_schema/
│   │   ├── README.md
│   │   └── bench_sglang.py
│   ├── kernels/
│   │   ├── all_reduce/
│   │   │   ├── benchmark_aiter.py
│   │   │   ├── benchmark_all_reduce.py
│   │   │   ├── benchmark_fused_ar_rms_amd.py
│   │   │   ├── benchmark_mscclpp.py
│   │   │   └── benchmark_torch_symm_mem.py
│   │   ├── decoding_attention_triton/
│   │   │   └── triton_flashinfer_cudnn.py
│   │   ├── deepep/
│   │   │   ├── deepep_utils.py
│   │   │   └── tuning_deepep.py
│   │   ├── deepseek/
│   │   │   ├── README.md
│   │   │   ├── benchmark_deepgemm_fp8_gemm.py
│   │   │   ├── benchmark_deepgemm_fp8_gemm_blackwell.py
│   │   │   └── benchmark_deepgemm_fp8_group_gemm.py
│   │   ├── elementwise/
│   │   │   └── benchmark_concat_mla.py
│   │   ├── flashinfer_allreduce_fusion/
│   │   │   ├── README.md
│   │   │   └── benchmark_fused_collective.py
│   │   ├── fused_moe_triton/
│   │   │   ├── README.md
│   │   │   ├── benchmark_sglang_fused_moe_triton.py
│   │   │   ├── benchmark_torch_compile_fused_moe.py
│   │   │   ├── benchmark_vllm_vs_sglang_fused_moe_triton.py
│   │   │   ├── common_utils.py
│   │   │   ├── tuning_client.py
│   │   │   ├── tuning_fused_moe_triton.py
│   │   │   ├── tuning_fused_moe_triton_sep.py
│   │   │   └── tuning_text.json
│   │   ├── quantization/
│   │   │   ├── README.md
│   │   │   ├── bench_fp4_quant.py
│   │   │   ├── bench_int8_quant.py
│   │   │   └── tuning_block_wise_kernel.py
│   │   ├── scheduler_batch/
│   │   │   ├── benchmark_get_last_loc_triton.py
│   │   │   └── benchmark_write_req_to_token_pool_triton.py
│   │   └── sliding_window_attention_triton/
│   │       └── bench_triton_swa_kernel.py
│   ├── line_retrieval/
│   │   ├── README.md
│   │   ├── bench_sglang.py
│   │   └── gen_data.py
│   ├── llava_bench/
│   │   ├── README.md
│   │   ├── bench_hf_llava_bench.sh
│   │   ├── bench_hf_mme.sh
│   │   ├── bench_sglang.py
│   │   ├── bench_sglang_mme.sh
│   │   └── download_images.py
│   ├── llm_judge/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   └── bench_sglang.py
│   ├── long_json_decode/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   └── build_dataset.py
│   ├── lora/
│   │   ├── launch_server.py
│   │   └── lora_bench.py
│   ├── mmlu/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   └── download_data.sh
│   ├── mmmu/
│   │   ├── README.md
│   │   ├── bench_hf.py
│   │   ├── bench_sglang.py
│   │   ├── data_utils.py
│   │   ├── eval_utils.py
│   │   └── prompt_format.yaml
│   ├── mtbench/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   └── bench_sglang_eagle.py
│   ├── multi_chain_reasoning/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   └── bench_sglang.py
│   ├── multi_document_qa/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   └── build_dataset.py
│   ├── multi_turn_chat/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   ├── data_gen.py
│   │   └── long_prompt_multi_turn.py
│   ├── prefill_only/
│   │   ├── bench_embeddings.py
│   │   ├── bench_score.py
│   │   └── util.py
│   ├── react/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   └── bench_sglang.py
│   ├── reasoning_benchmark/
│   │   ├── README.md
│   │   ├── answer_extraction.py
│   │   ├── bench_sglang.py
│   │   └── eval_utils.py
│   ├── tip_suggestion/
│   │   ├── .gitignore
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   ├── lmql_funcs.py
│   │   └── topic.jsonl
│   ├── tree_of_thought_deep/
│   │   ├── README.md
│   │   ├── bench_other.py
│   │   ├── bench_sglang.py
│   │   └── lmql_funcs.py
│   └── tree_of_thought_v0/
│       ├── README.md
│       ├── bench_other.py
│       └── bench_sglang.py
├── docker/
│   ├── Dockerfile
│   ├── compose.yaml
│   ├── configs/
│   │   ├── .zshrc
│   │   ├── opt/
│   │   │   ├── .gitconfig
│   │   │   ├── .tmux.conf
│   │   │   └── .vimrc
│   │   └── yank
│   ├── diffusion.Dockerfile
│   ├── gateway.Dockerfile
│   ├── k8s-sglang-distributed-sts.yaml
│   ├── k8s-sglang-service.yaml
│   ├── npu.Dockerfile
│   ├── rocm.Dockerfile
│   ├── sagemaker.Dockerfile
│   ├── serve
│   ├── xeon.Dockerfile
│   └── xpu.Dockerfile
├── docs/
│   ├── Makefile
│   ├── README.md
│   ├── _static/
│   │   └── css/
│   │       ├── custom_log.css
│   │       └── readthedocs.css
│   ├── advanced_features/
│   │   ├── attention_backend.md
│   │   ├── checkpoint_engine.md
│   │   ├── cuda_graph_for_multi_modal_encoder.md
│   │   ├── deterministic_inference.md
│   │   ├── dp_dpa_smg_guide.md
│   │   ├── dp_for_multi_modal_encoder.md
│   │   ├── epd_disaggregation.md
│   │   ├── expert_parallelism.md
│   │   ├── forward_hooks.md
│   │   ├── hicache.rst
│   │   ├── hicache_best_practices.md
│   │   ├── hicache_design.md
│   │   ├── hicache_storage_runtime_attach_detach.md
│   │   ├── hyperparameter_tuning.md
│   │   ├── lora.ipynb
│   │   ├── observability.md
│   │   ├── pd_disaggregation.md
│   │   ├── piecewise_cuda_graph.md
│   │   ├── pipeline_parallelism.md
│   │   ├── quantization.md
│   │   ├── quantized_kv_cache.md
│   │   ├── rfork.md
│   │   ├── separate_reasoning.ipynb
│   │   ├── server_arguments.md
│   │   ├── sgl_model_gateway.md
│   │   ├── sglang_for_rl.md
│   │   ├── speculative_decoding.md
│   │   ├── structured_outputs.ipynb
│   │   ├── structured_outputs_for_reasoning_models.ipynb
│   │   ├── tool_parser.ipynb
│   │   └── vlm_query.ipynb
│   ├── basic_usage/
│   │   ├── deepseek_ocr.md
│   │   ├── deepseek_v3.md
│   │   ├── deepseek_v32.md
│   │   ├── glm45.md
│   │   ├── glmv.md
│   │   ├── gpt_oss.md
│   │   ├── llama4.md
│   │   ├── minimax_m2.md
│   │   ├── native_api.ipynb
│   │   ├── offline_engine_api.ipynb
│   │   ├── ollama_api.md
│   │   ├── openai_api.rst
│   │   ├── openai_api_completions.ipynb
│   │   ├── openai_api_embeddings.ipynb
│   │   ├── openai_api_vision.ipynb
│   │   ├── popular_model_usage.rst
│   │   ├── qwen3.md
│   │   ├── qwen3_5.md
│   │   ├── qwen3_vl.md
│   │   ├── sampling_params.md
│   │   └── send_request.ipynb
│   ├── conf.py
│   ├── deploy.py
│   ├── developer_guide/
│   │   ├── bench_serving.md
│   │   ├── benchmark_and_profiling.md
│   │   ├── contribution_guide.md
│   │   ├── development_guide_using_docker.md
│   │   ├── development_jit_kernel_guide.md
│   │   ├── evaluating_new_models.md
│   │   ├── release_process.md
│   │   └── setup_github_runner.md
│   ├── diffusion/
│   │   ├── api/
│   │   │   ├── cli.md
│   │   │   ├── openai_api.md
│   │   │   └── post_processing.md
│   │   ├── ci_perf.md
│   │   ├── compatibility_matrix.md
│   │   ├── contributing.md
│   │   ├── environment_variables.md
│   │   ├── index.md
│   │   ├── installation.md
│   │   ├── performance/
│   │   │   ├── attention_backends.md
│   │   │   ├── cache/
│   │   │   │   ├── cache_dit.md
│   │   │   │   ├── index.md
│   │   │   │   └── teacache.md
│   │   │   ├── index.md
│   │   │   └── profiling.md
│   │   └── support_new_models.md
│   ├── get_started/
│   │   └── install.md
│   ├── index.rst
│   ├── performance_dashboard/
│   │   ├── README.md
│   │   ├── app.js
│   │   ├── fetch_metrics.py
│   │   ├── index.html
│   │   └── server.py
│   ├── platforms/
│   │   ├── amd_gpu.md
│   │   ├── apple_metal.md
│   │   ├── ascend_contribution_guide.md
│   │   ├── ascend_npu.md
│   │   ├── ascend_npu_best_practice.md
│   │   ├── ascend_npu_deepseek_example.md
│   │   ├── ascend_npu_environment_variables.md
│   │   ├── ascend_npu_glm5_examples.md
│   │   ├── ascend_npu_quantization.md
│   │   ├── ascend_npu_qwen3_5_examples.md
│   │   ├── ascend_npu_qwen3_examples.md
│   │   ├── ascend_npu_support.rst
│   │   ├── ascend_npu_support_features.md
│   │   ├── ascend_npu_support_models.md
│   │   ├── cpu_server.md
│   │   ├── mindspore_backend.md
│   │   ├── mthreads_gpu.md
│   │   ├── nvidia_jetson.md
│   │   ├── tpu.md
│   │   └── xpu.md
│   ├── references/
│   │   ├── custom_chat_template.md
│   │   ├── environment_variables.md
│   │   ├── faq.md
│   │   ├── frontend/
│   │   │   ├── choices_methods.md
│   │   │   ├── frontend_index.rst
│   │   │   └── frontend_tutorial.ipynb
│   │   ├── learn_more.md
│   │   ├── multi_node_deployment/
│   │   │   ├── deploy_on_k8s.md
│   │   │   ├── lws_pd/
│   │   │   │   ├── lws-examples/
│   │   │   │   │   ├── d-svc.yaml
│   │   │   │   │   ├── d.yaml
│   │   │   │   │   ├── lb.yaml
│   │   │   │   │   ├── p-svc.yaml
│   │   │   │   │   └── p.yaml
│   │   │   │   └── lws_pd_deploy.md
│   │   │   ├── multi_node.md
│   │   │   ├── multi_node_index.rst
│   │   │   └── rbg_pd/
│   │   │       └── deepseekv32_pd.md
│   │   ├── post_training_integration.md
│   │   ├── production_metrics.md
│   │   ├── production_request_trace.md
│   │   ├── release_lookup.rst
│   │   └── torch_compile_cache.md
│   ├── release_lookup/
│   │   ├── README.md
│   │   ├── generate_index.py
│   │   ├── index.html
│   │   └── release_index.json
│   ├── requirements.txt
│   ├── serve.sh
│   ├── supported_models/
│   │   ├── extending/
│   │   │   ├── index.rst
│   │   │   ├── mindspore_models.md
│   │   │   ├── modelscope.md
│   │   │   ├── support_new_models.md
│   │   │   └── transformers_fallback.md
│   │   ├── index.rst
│   │   ├── retrieval_ranking/
│   │   │   ├── classify_models.md
│   │   │   ├── embedding_models.md
│   │   │   ├── index.rst
│   │   │   └── rerank_models.md
│   │   ├── specialized/
│   │   │   ├── index.rst
│   │   │   └── reward_models.md
│   │   └── text_generation/
│   │       ├── diffusion_language_models.md
│   │       ├── generative_models.md
│   │       ├── index.rst
│   │       └── multimodal_language_models.md
│   └── wrap_run_llm.py
├── examples/
│   ├── assets/
│   │   └── .gitignore
│   ├── chat_template/
│   │   ├── qwen3_reranker.jinja
│   │   ├── qwen3_vl_reranker.jinja
│   │   ├── tool_chat_template_deepseekr1.jinja
│   │   ├── tool_chat_template_deepseekv3.jinja
│   │   ├── tool_chat_template_deepseekv31.jinja
│   │   ├── tool_chat_template_deepseekv32.jinja
│   │   ├── tool_chat_template_llama3.1_json.jinja
│   │   ├── tool_chat_template_llama4_pythonic.jinja
│   │   └── vision_template_sarashina_vl.jinja
│   ├── checkpoint_engine/
│   │   └── update.py
│   ├── frontend_language/
│   │   ├── quick_start/
│   │   │   ├── anthropic_example_chat.py
│   │   │   ├── anthropic_example_complete.py
│   │   │   ├── azure_openai_example_chat.py
│   │   │   ├── gemini_example_chat.py
│   │   │   ├── gemini_example_complete.py
│   │   │   ├── gemini_example_multimodal_chat.py
│   │   │   ├── local_example_chat.py
│   │   │   ├── local_example_complete.py
│   │   │   ├── local_example_llava_next.py
│   │   │   ├── openai_example_chat.py
│   │   │   ├── openai_example_complete.py
│   │   │   ├── openai_example_n.py
│   │   │   ├── openai_example_o1.py
│   │   │   ├── openrouter_example_chat.py
│   │   │   ├── together_example_chat.py
│   │   │   └── together_example_complete.py
│   │   └── usage/
│   │       ├── chinese_regex.py
│   │       ├── choices_logprob.py
│   │       ├── cot_decoding.py
│   │       ├── json_decode.py
│   │       ├── json_logprobs.py
│   │       ├── llava_video/
│   │       │   ├── srt_example_llava_v.py
│   │       │   └── srt_example_llava_v.sh
│   │       ├── openai_chat_speculative.py
│   │       ├── openai_speculative.py
│   │       ├── parallel_sample.py
│   │       ├── rag_using_parea/
│   │       │   └── trace_and_evaluate_rag_using_parea.ipynb
│   │       ├── readme_examples.py
│   │       ├── sgl_gen_min_tokens.py
│   │       ├── streaming.py
│   │       └── triton/
│   │           ├── Dockerfile
│   │           ├── README.md
│   │           └── models/
│   │               └── character_generation/
│   │                   ├── 1/
│   │                   │   └── model.py
│   │                   └── config.pbtxt
│   ├── monitoring/
│   │   ├── README.md
│   │   ├── docker-compose.yaml
│   │   ├── grafana/
│   │   │   ├── dashboards/
│   │   │   │   ├── config/
│   │   │   │   │   └── dashboard.yaml
│   │   │   │   └── json/
│   │   │   │       └── sglang-dashboard.json
│   │   │   └── datasources/
│   │   │       └── datasource.yaml
│   │   ├── opentelemetry.yaml
│   │   ├── prometheus.yaml
│   │   └── tracing_compose.yaml
│   ├── profiler/
│   │   └── nsys_profile_tools/
│   │       ├── README.md
│   │       ├── gputrc2graph.py
│   │       └── sglang_engine_model.json
│   ├── runtime/
│   │   ├── README.md
│   │   ├── engine/
│   │   │   ├── custom_server.py
│   │   │   ├── embedding.py
│   │   │   ├── fastapi_engine_inference.py
│   │   │   ├── launch_engine.py
│   │   │   ├── offline_batch_inference.py
│   │   │   ├── offline_batch_inference_async.py
│   │   │   ├── offline_batch_inference_eagle.py
│   │   │   ├── offline_batch_inference_qwen_1m.py
│   │   │   ├── offline_batch_inference_vlm.py
│   │   │   ├── readme.md
│   │   │   ├── save_remote_state.py
│   │   │   └── save_sharded_state.py
│   │   ├── hidden_states/
│   │   │   ├── hidden_states_engine.py
│   │   │   └── hidden_states_server.py
│   │   ├── lora.py
│   │   ├── multimodal/
│   │   │   ├── llama3_llava_server.py
│   │   │   ├── llava_onevision_server.py
│   │   │   ├── pixtral_server.py
│   │   │   └── qwen_llava_server.py
│   │   ├── multimodal_embedding.py
│   │   ├── openai_chat_with_response_prefill.py
│   │   ├── qwen3_vl_reranker.py
│   │   ├── reward_model.py
│   │   ├── token_in_token_out/
│   │   │   ├── token_in_token_out_llm_engine.py
│   │   │   ├── token_in_token_out_llm_server.py
│   │   │   ├── token_in_token_out_vlm_engine.py
│   │   │   └── token_in_token_out_vlm_server.py
│   │   └── vertex_predict.py
│   ├── sagemaker/
│   │   └── deploy_and_serve_endpoint.py
│   └── usage/
│       └── modelopt_quantize_and_export.py
├── python/
│   ├── pyproject.toml
│   ├── pyproject_cpu.toml
│   ├── pyproject_npu.toml
│   ├── pyproject_other.toml
│   ├── pyproject_xpu.toml
│   └── sglang/
│       ├── README.md
│       ├── __init__.py
│       ├── _mps_stub.py
│       ├── _triton_stub.py
│       ├── bench_offline_throughput.py
│       ├── bench_one_batch.py
│       ├── bench_one_batch_server.py
│       ├── bench_serving.py
│       ├── benchmark/
│       │   ├── __init__.py
│       │   ├── bench_utils.py
│       │   ├── datasets/
│       │   │   ├── __init__.py
│       │   │   ├── common.py
│       │   │   ├── custom.py
│       │   │   ├── generated_shared_prefix.py
│       │   │   ├── image.py
│       │   │   ├── mmmu.py
│       │   │   ├── mooncake.py
│       │   │   ├── openai_dataset.py
│       │   │   ├── random.py
│       │   │   └── sharegpt.py
│       │   └── utils.py
│       ├── check_env.py
│       ├── cli/
│       │   ├── __init__.py
│       │   ├── generate.py
│       │   ├── main.py
│       │   ├── serve.py
│       │   └── utils.py
│       ├── compile_deep_gemm.py
│       ├── eval/
│       │   ├── llama3_eval.py
│       │   └── loogle_eval.py
│       ├── global_config.py
│       ├── jit_kernel/
│       │   ├── .clang-format
│       │   ├── __init__.py
│       │   ├── __main__.py
│       │   ├── add_constant.py
│       │   ├── awq_dequantize.py
│       │   ├── awq_marlin_repack.py
│       │   ├── benchmark/
│       │   │   ├── bench_awq_dequantize.py
│       │   │   ├── bench_awq_marlin_moe_repack.py
│       │   │   ├── bench_awq_marlin_repack.py
│       │   │   ├── bench_concat_mla.py
│       │   │   ├── bench_fused_add_rmsnorm.py
│       │   │   ├── bench_fused_norm_scale_shift.py
│       │   │   ├── bench_gptq_marlin.py
│       │   │   ├── bench_gptq_marlin_repack.py
│       │   │   ├── bench_hadamard.py
│       │   │   ├── bench_hicache.py
│       │   │   ├── bench_moe_wna16_marlin.py
│       │   │   ├── bench_norm.py
│       │   │   ├── bench_norm_impls.py
│       │   │   ├── bench_nvfp4_blockwise_moe.py
│       │   │   ├── bench_nvfp4_quant.py
│       │   │   ├── bench_nvfp4_scaled_mm.py
│       │   │   ├── bench_per_tensor_quant_fp8.py
│       │   │   ├── bench_per_token_group_quant_8bit.py
│       │   │   ├── bench_qknorm.py
│       │   │   ├── bench_qknorm_across_heads.py
│       │   │   ├── bench_qwen_image_modulation.py
│       │   │   ├── bench_renorm.py
│       │   │   ├── bench_rmsnorm.py
│       │   │   ├── bench_rope.py
│       │   │   ├── bench_store_cache.py
│       │   │   └── utils.py
│       │   ├── concat_mla.py
│       │   ├── csrc/
│       │   │   ├── add_constant.cuh
│       │   │   ├── diffusion/
│       │   │   │   └── timestep_embedding.cuh
│       │   │   ├── elementwise/
│       │   │   │   ├── concat_mla.cuh
│       │   │   │   ├── fused_add_rmsnorm.cuh
│       │   │   │   ├── fused_metadata_copy.cuh
│       │   │   │   ├── kvcache.cuh
│       │   │   │   ├── pos_enc.cuh
│       │   │   │   ├── qknorm.cuh
│       │   │   │   ├── qknorm_across_heads.cuh
│       │   │   │   ├── rmsnorm.cuh
│       │   │   │   └── rope.cuh
│       │   │   ├── fast-hadamard-transform/
│       │   │   │   ├── code_gen.py
│       │   │   │   ├── fast_hadamard_transform.h
│       │   │   │   ├── fast_hadamard_transform_common.h
│       │   │   │   ├── fast_hadamard_transform_special.h
│       │   │   │   ├── hadamard_jit.cuh
│       │   │   │   └── static_switch.h
│       │   │   ├── gemm/
│       │   │   │   ├── awq_dequantize.cuh
│       │   │   │   ├── marlin/
│       │   │   │   │   ├── awq_marlin_repack.cuh
│       │   │   │   │   ├── dequant.h
│       │   │   │   │   ├── gptq_marlin.cuh
│       │   │   │   │   ├── gptq_marlin_repack.cuh
│       │   │   │   │   ├── kernel.h
│       │   │   │   │   ├── marlin.cuh
│       │   │   │   │   ├── marlin_dtypes.cuh
│       │   │   │   │   └── marlin_template.h
│       │   │   │   ├── marlin_moe/
│       │   │   │   │   ├── kernel.h
│       │   │   │   │   ├── marlin_template.h
│       │   │   │   │   └── moe_wna16_marlin.cuh
│       │   │   │   ├── nvfp4/
│       │   │   │   │   ├── nvfp4_expert_quant.cuh
│       │   │   │   │   ├── nvfp4_quant.cuh
│       │   │   │   │   ├── nvfp4_quant_entry.cuh
│       │   │   │   │   ├── nvfp4_quant_kernels.cuh
│       │   │   │   │   ├── nvfp4_scaled_mm_entry.cuh
│       │   │   │   │   └── nvfp4_scaled_mm_kernels.cuh
│       │   │   │   ├── per_tensor_quant_fp8.cuh
│       │   │   │   └── per_token_group_quant_8bit.cuh
│       │   │   ├── hicache.cuh
│       │   │   ├── lora/
│       │   │   │   └── moe_lora_align_kernel.cu
│       │   │   ├── moe/
│       │   │   │   └── nvfp4_blockwise_moe.cuh
│       │   │   ├── ngram_embedding.cuh
│       │   │   └── nsa/
│       │   │       └── fused_store_index_cache.cuh
│       │   ├── cutedsl_gdn.py
│       │   ├── diffusion/
│       │   │   ├── cutedsl/
│       │   │   │   ├── common/
│       │   │   │   │   ├── norm_fusion.py
│       │   │   │   │   └── reduce.py
│       │   │   │   ├── scale_residual_norm_scale_shift.py
│       │   │   │   └── utils.py
│       │   │   └── triton/
│       │   │       ├── mps_fallback.py
│       │   │       ├── norm.py
│       │   │       ├── npu_fallback.py
│       │   │       ├── rmsnorm_onepass.py
│       │   │       ├── rotary.py
│       │   │       └── scale_shift.py
│       │   ├── flash_attention_v4.py
│       │   ├── fused_metadata_copy.py
│       │   ├── fused_store_index_cache.py
│       │   ├── gptq_marlin.py
│       │   ├── gptq_marlin_repack.py
│       │   ├── hadamard.py
│       │   ├── hicache.py
│       │   ├── include/
│       │   │   └── sgl_kernel/
│       │   │       ├── atomic.cuh
│       │   │       ├── cta.cuh
│       │   │       ├── impl/
│       │   │       │   └── norm.cuh
│       │   │       ├── math.cuh
│       │   │       ├── runtime.cuh
│       │   │       ├── scalar_type.hpp
│       │   │       ├── source_location.h
│       │   │       ├── tensor.h
│       │   │       ├── tile.cuh
│       │   │       ├── type.cuh
│       │   │       ├── utils.cuh
│       │   │       ├── utils.h
│       │   │       ├── vec.cuh
│       │   │       └── warp.cuh
│       │   ├── kvcache.py
│       │   ├── moe_lora_align.py
│       │   ├── moe_wna16_marlin.py
│       │   ├── ngram_embedding.py
│       │   ├── norm.py
│       │   ├── nvfp4.py
│       │   ├── per_tensor_quant_fp8.py
│       │   ├── per_token_group_quant_8bit.py
│       │   ├── rope.py
│       │   ├── tests/
│       │   │   ├── test_add_constant.py
│       │   │   ├── test_awq_dequantize.py
│       │   │   ├── test_awq_marlin_moe_repack.py
│       │   │   ├── test_awq_marlin_repack.py
│       │   │   ├── test_concat_mla.py
│       │   │   ├── test_cutedsl_gdn.py
│       │   │   ├── test_flash_attention_4.py
│       │   │   ├── test_fused_add_rmsnorm.py
│       │   │   ├── test_fused_metadata_copy.py
│       │   │   ├── test_fused_norm_scale_shift.py
│       │   │   ├── test_fused_store_index_cache.py
│       │   │   ├── test_fused_verify_triton_gdn.py
│       │   │   ├── test_gptq_marlin.py
│       │   │   ├── test_gptq_marlin_repack.py
│       │   │   ├── test_hadamard_jit.py
│       │   │   ├── test_moe_lora_align_block_size.py
│       │   │   ├── test_moe_wna16_marlin.py
│       │   │   ├── test_norm_jit.py
│       │   │   ├── test_nvfp4_blockwise_moe.py
│       │   │   ├── test_nvfp4_gemm.py
│       │   │   ├── test_nvfp4_quant.py
│       │   │   ├── test_per_tensor_quant_fp8.py
│       │   │   ├── test_per_token_group_quant_8bit.py
│       │   │   ├── test_pos_enc.py
│       │   │   ├── test_qknorm.py
│       │   │   ├── test_qknorm_across_heads.py
│       │   │   ├── test_qwen_image_modulation.py
│       │   │   ├── test_renorm.py
│       │   │   ├── test_rmsnorm.py
│       │   │   ├── test_rope.py
│       │   │   ├── test_store_cache.py
│       │   │   └── test_timestep_embedding.py
│       │   ├── timestep_embedding.py
│       │   └── utils.py
│       ├── lang/
│       │   ├── api.py
│       │   ├── backend/
│       │   │   ├── anthropic.py
│       │   │   ├── base_backend.py
│       │   │   ├── litellm.py
│       │   │   ├── openai.py
│       │   │   ├── runtime_endpoint.py
│       │   │   └── vertexai.py
│       │   ├── chat_template.py
│       │   ├── choices.py
│       │   ├── interpreter.py
│       │   ├── ir.py
│       │   └── tracer.py
│       ├── launch_server.py
│       ├── multimodal_gen/
│       │   ├── .claude/
│       │   │   ├── CLAUDE.md
│       │   │   └── skills/
│       │   │       ├── diffusion-kernel/
│       │   │       │   ├── SKILL.md
│       │   │       │   ├── add-cuda-kernel.md
│       │   │       │   ├── add-triton-kernel.md
│       │   │       │   ├── diffusion-benchmark-and-profile.md
│       │   │       │   ├── nsight-profiler.md
│       │   │       │   ├── references/
│       │   │       │   │   ├── a100-optimization-guide.md
│       │   │       │   │   ├── h100-optimization-guide.md
│       │   │       │   │   ├── kernel-templates.md
│       │   │       │   │   ├── t4-optimization-guide.md
│       │   │       │   │   └── troubleshooting.md
│       │   │       │   ├── scripts/
│       │   │       │   │   ├── bench_diffusion_denoise.py
│       │   │       │   │   ├── bench_diffusion_rmsnorm.py
│       │   │       │   │   └── diffusion_skill_env.py
│       │   │       │   └── use-efficient-diffusion-kernels.md
│       │   │       ├── diffusion-optimal-perf/
│       │   │       │   └── SKILL.md
│       │   │       └── support-new-model/
│       │   │           └── SKILL.md
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── apps/
│       │   │   ├── ComfyUI_SGLDiffusion/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── core/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── generator.py
│       │   │   │   │   ├── model_patcher.py
│       │   │   │   │   └── server_api.py
│       │   │   │   ├── executors/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── flux.py
│       │   │   │   │   ├── qwen_image.py
│       │   │   │   │   └── zimage.py
│       │   │   │   ├── nodes.py
│       │   │   │   ├── test/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── test_flux_pipeline.py
│       │   │   │   │   ├── test_qwen_image_edit_pipeline.py
│       │   │   │   │   ├── test_qwen_image_pipeline.py
│       │   │   │   │   └── test_zimage_pipeline.py
│       │   │   │   ├── utils.py
│       │   │   │   └── workflows/
│       │   │   │       ├── flux_sgld_sp.json
│       │   │   │       ├── qwen_image_sgld.json
│       │   │   │       ├── sgld_image2video.json
│       │   │   │       ├── sgld_text2img.json
│       │   │   │       └── z-image_sgld.json
│       │   │   └── webui/
│       │   │       ├── README.md
│       │   │       ├── __init__.py
│       │   │       └── main.py
│       │   ├── benchmarks/
│       │   │   ├── bench_offline_throughput.py
│       │   │   ├── bench_serving.py
│       │   │   ├── compare_perf.py
│       │   │   └── datasets.py
│       │   ├── configs/
│       │   │   ├── __init__.py
│       │   │   ├── backend/
│       │   │   │   └── vmoba/
│       │   │   │       ├── wan_1.3B_77_448_832.json
│       │   │   │       └── wan_1.3B_77_480_832.json
│       │   │   ├── models/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── adapter/
│       │   │   │   │   ├── base.py
│       │   │   │   │   └── ltx_2_connector.py
│       │   │   │   ├── base.py
│       │   │   │   ├── bridges/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── mova_dual_tower.py
│       │   │   │   ├── dits/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── flux.py
│       │   │   │   │   ├── glmimage.py
│       │   │   │   │   ├── helios.py
│       │   │   │   │   ├── hunyuan3d.py
│       │   │   │   │   ├── hunyuanvideo.py
│       │   │   │   │   ├── ltx_2.py
│       │   │   │   │   ├── mova_audio.py
│       │   │   │   │   ├── mova_video.py
│       │   │   │   │   ├── qwenimage.py
│       │   │   │   │   ├── sana.py
│       │   │   │   │   ├── wanvideo.py
│       │   │   │   │   └── zimage.py
│       │   │   │   ├── encoders/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── clip.py
│       │   │   │   │   ├── gemma2.py
│       │   │   │   │   ├── gemma_3.py
│       │   │   │   │   ├── llama.py
│       │   │   │   │   ├── qwen3.py
│       │   │   │   │   ├── qwen_image.py
│       │   │   │   │   └── t5.py
│       │   │   │   ├── vaes/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── dac.py
│       │   │   │   │   ├── flux.py
│       │   │   │   │   ├── glmimage.py
│       │   │   │   │   ├── hunyuan3d.py
│       │   │   │   │   ├── hunyuanvae.py
│       │   │   │   │   ├── ltx_audio.py
│       │   │   │   │   ├── ltx_video.py
│       │   │   │   │   ├── qwenimage.py
│       │   │   │   │   ├── sana.py
│       │   │   │   │   └── wanvae.py
│       │   │   │   └── vocoder/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── base.py
│       │   │   │       └── ltx_vocoder.py
│       │   │   ├── pipeline_configs/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   ├── diffusers_generic.py
│       │   │   │   ├── flux.py
│       │   │   │   ├── flux_finetuned.py
│       │   │   │   ├── glm_image.py
│       │   │   │   ├── helios.py
│       │   │   │   ├── hunyuan.py
│       │   │   │   ├── hunyuan3d.py
│       │   │   │   ├── ltx_2.py
│       │   │   │   ├── mova.py
│       │   │   │   ├── qwen_image.py
│       │   │   │   ├── sana.py
│       │   │   │   ├── wan.py
│       │   │   │   └── zimage.py
│       │   │   ├── quantization.py
│       │   │   ├── sample/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── diffusers_generic.py
│       │   │   │   ├── flux.py
│       │   │   │   ├── glmimage.py
│       │   │   │   ├── helios.py
│       │   │   │   ├── hunyuan.py
│       │   │   │   ├── hunyuan3d.py
│       │   │   │   ├── ltx_2.py
│       │   │   │   ├── mova.py
│       │   │   │   ├── qwenimage.py
│       │   │   │   ├── sampling_params.py
│       │   │   │   ├── sana.py
│       │   │   │   ├── teacache.py
│       │   │   │   ├── wan.py
│       │   │   │   └── zimage.py
│       │   │   └── utils.py
│       │   ├── csrc/
│       │   │   ├── attn/
│       │   │   │   └── vmoba_attn/
│       │   │   │       ├── README.md
│       │   │   │       ├── setup.py
│       │   │   │       ├── tests/
│       │   │   │       │   └── test_vmoba_attn.py
│       │   │   │       └── vmoba/
│       │   │   │           ├── __init__.py
│       │   │   │           └── vmoba.py
│       │   │   └── render/
│       │   │       ├── hunyuan3d_rasterizer/
│       │   │       │   ├── __init__.py
│       │   │       │   ├── rasterizer.cpp
│       │   │       │   ├── rasterizer.h
│       │   │       │   └── rasterizer_gpu.cu
│       │   │       └── mesh_processor/
│       │   │           ├── __init__.py
│       │   │           └── mesh_processor.cpp
│       │   ├── docs/
│       │   │   └── quantization.md
│       │   ├── envs.py
│       │   ├── registry.py
│       │   ├── runtime/
│       │   │   ├── cache/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── cache_dit_integration.py
│       │   │   │   └── teacache.py
│       │   │   ├── distributed/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── communication_op.py
│       │   │   │   ├── device_communicators/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base_device_communicator.py
│       │   │   │   │   ├── cpu_communicator.py
│       │   │   │   │   ├── cuda_communicator.py
│       │   │   │   │   ├── pynccl.py
│       │   │   │   │   └── pynccl_wrapper.py
│       │   │   │   ├── group_coordinator.py
│       │   │   │   ├── parallel_groups.py
│       │   │   │   ├── parallel_state.py
│       │   │   │   └── utils.py
│       │   │   ├── entrypoints/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── cli/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── cli_types.py
│       │   │   │   │   ├── generate.py
│       │   │   │   │   ├── main.py
│       │   │   │   │   ├── serve.py
│       │   │   │   │   └── utils.py
│       │   │   │   ├── diffusion_generator.py
│       │   │   │   ├── http_server.py
│       │   │   │   ├── openai/
│       │   │   │   │   ├── common_api.py
│       │   │   │   │   ├── image_api.py
│       │   │   │   │   ├── mesh_api.py
│       │   │   │   │   ├── protocol.py
│       │   │   │   │   ├── storage.py
│       │   │   │   │   ├── stores.py
│       │   │   │   │   ├── utils.py
│       │   │   │   │   └── video_api.py
│       │   │   │   ├── post_training/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── io_struct.py
│       │   │   │   │   └── weights_api.py
│       │   │   │   └── utils.py
│       │   │   ├── launch_server.py
│       │   │   ├── layers/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── activation.py
│       │   │   │   ├── attention/
│       │   │   │   │   ├── STA_configuration.py
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── backends/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── aiter.py
│       │   │   │   │   │   ├── aiter_sage.py
│       │   │   │   │   │   ├── attention_backend.py
│       │   │   │   │   │   ├── flash_attn.py
│       │   │   │   │   │   ├── flash_attn_2.py
│       │   │   │   │   │   ├── sage_attn.py
│       │   │   │   │   │   ├── sage_attn3.py
│       │   │   │   │   │   ├── sdpa.py
│       │   │   │   │   │   ├── sliding_tile_attn.py
│       │   │   │   │   │   ├── sparse_linear_attn.py
│       │   │   │   │   │   ├── sparse_video_gen_2_attn.py
│       │   │   │   │   │   ├── video_sparse_attn.py
│       │   │   │   │   │   └── vmoba.py
│       │   │   │   │   ├── layer.py
│       │   │   │   │   ├── selector.py
│       │   │   │   │   └── turbo_layer.py
│       │   │   │   ├── custom_op.py
│       │   │   │   ├── elementwise.py
│       │   │   │   ├── layernorm.py
│       │   │   │   ├── linear.py
│       │   │   │   ├── lora/
│       │   │   │   │   └── linear.py
│       │   │   │   ├── mlp.py
│       │   │   │   ├── quantization/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── configs/
│       │   │   │   │   │   ├── base_config.py
│       │   │   │   │   │   └── nunchaku_config.py
│       │   │   │   │   ├── fp8.py
│       │   │   │   │   ├── modelslim.py
│       │   │   │   │   └── nunchaku_linear.py
│       │   │   │   ├── rotary_embedding/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── factory.py
│       │   │   │   │   ├── mrope.py
│       │   │   │   │   └── utils.py
│       │   │   │   ├── usp.py
│       │   │   │   ├── utils.py
│       │   │   │   ├── visual_embedding.py
│       │   │   │   └── vocab_parallel_embedding.py
│       │   │   ├── loader/
│       │   │   │   ├── component_loaders/
│       │   │   │   │   ├── adapter_loader.py
│       │   │   │   │   ├── bridge_loader.py
│       │   │   │   │   ├── component_loader.py
│       │   │   │   │   ├── image_encoder_loader.py
│       │   │   │   │   ├── scheduler_loader.py
│       │   │   │   │   ├── text_encoder_loader.py
│       │   │   │   │   ├── transformer_loader.py
│       │   │   │   │   ├── vae_loader.py
│       │   │   │   │   ├── vl_encoder_loader.py
│       │   │   │   │   └── vocoder_loader.py
│       │   │   │   ├── fsdp_load.py
│       │   │   │   ├── utils.py
│       │   │   │   ├── weight_utils.py
│       │   │   │   └── weights_updater.py
│       │   │   ├── managers/
│       │   │   │   ├── forward_context.py
│       │   │   │   ├── gpu_worker.py
│       │   │   │   └── scheduler.py
│       │   │   ├── models/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── adapter/
│       │   │   │   │   └── ltx_2_connector.py
│       │   │   │   ├── bridges/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── mova_dual_tower.py
│       │   │   │   ├── dits/
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── causal_wanvideo.py
│       │   │   │   │   ├── flux.py
│       │   │   │   │   ├── flux_2.py
│       │   │   │   │   ├── glm_image.py
│       │   │   │   │   ├── helios.py
│       │   │   │   │   ├── hunyuan3d.py
│       │   │   │   │   ├── hunyuanvideo.py
│       │   │   │   │   ├── ltx_2.py
│       │   │   │   │   ├── mova_audio_dit.py
│       │   │   │   │   ├── mova_video_dit.py
│       │   │   │   │   ├── qwen_image.py
│       │   │   │   │   ├── sana.py
│       │   │   │   │   ├── wanvideo.py
│       │   │   │   │   └── zimage.py
│       │   │   │   ├── encoders/
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── bert.py
│       │   │   │   │   ├── clip.py
│       │   │   │   │   ├── gemma2.py
│       │   │   │   │   ├── gemma_3.py
│       │   │   │   │   ├── hunyuan3d.py
│       │   │   │   │   ├── llama.py
│       │   │   │   │   ├── mistral_3.py
│       │   │   │   │   ├── qwen2_5vl.py
│       │   │   │   │   ├── qwen3.py
│       │   │   │   │   ├── t5.py
│       │   │   │   │   └── vision.py
│       │   │   │   ├── parameter.py
│       │   │   │   ├── registry.py
│       │   │   │   ├── schedulers/
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── flow_match_pair.py
│       │   │   │   │   ├── hunyuan3d_scheduler.py
│       │   │   │   │   ├── scheduling_comfyui_passthrough.py
│       │   │   │   │   ├── scheduling_dpm_solver_multistep.py
│       │   │   │   │   ├── scheduling_flow_match_euler_discrete.py
│       │   │   │   │   ├── scheduling_flow_unipc_multistep.py
│       │   │   │   │   ├── scheduling_helios.py
│       │   │   │   │   ├── scheduling_self_forcing_flow_match.py
│       │   │   │   │   └── scheduling_unipc_multistep.py
│       │   │   │   ├── utils.py
│       │   │   │   ├── vaes/
│       │   │   │   │   ├── autoencoder.py
│       │   │   │   │   ├── autoencoder_dc.py
│       │   │   │   │   ├── autoencoder_kl_flux2.py
│       │   │   │   │   ├── autoencoder_kl_qwenimage.py
│       │   │   │   │   ├── common.py
│       │   │   │   │   ├── dac.py
│       │   │   │   │   ├── hunyuan3d_vae.py
│       │   │   │   │   ├── hunyuanvae.py
│       │   │   │   │   ├── ltx_2_audio.py
│       │   │   │   │   ├── ltx_2_vae.py
│       │   │   │   │   ├── parallel/
│       │   │   │   │   │   ├── wan_common_utils.py
│       │   │   │   │   │   └── wan_dist_utils.py
│       │   │   │   │   └── wanvae.py
│       │   │   │   ├── vision_utils.py
│       │   │   │   └── vocoder/
│       │   │   │       └── ltx_2_vocoder.py
│       │   │   ├── pipelines/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── comfyui_flux_pipeline.py
│       │   │   │   ├── comfyui_qwen_image_pipeline.py
│       │   │   │   ├── comfyui_zimage_pipeline.py
│       │   │   │   ├── diffusers_pipeline.py
│       │   │   │   ├── flux.py
│       │   │   │   ├── flux_2.py
│       │   │   │   ├── flux_2_klein.py
│       │   │   │   ├── glm_image.py
│       │   │   │   ├── helios_pipeline.py
│       │   │   │   ├── hunyuan3d_pipeline.py
│       │   │   │   ├── hunyuan_pipeline.py
│       │   │   │   ├── ltx_2_pipeline.py
│       │   │   │   ├── mova_pipeline.py
│       │   │   │   ├── qwen_image.py
│       │   │   │   ├── sana.py
│       │   │   │   ├── wan_causal_dmd_pipeline.py
│       │   │   │   ├── wan_dmd_pipeline.py
│       │   │   │   ├── wan_i2v_dmd_pipeline.py
│       │   │   │   ├── wan_i2v_pipeline.py
│       │   │   │   ├── wan_pipeline.py
│       │   │   │   └── zimage_pipeline.py
│       │   │   ├── pipelines_core/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── composed_pipeline_base.py
│       │   │   │   ├── executors/
│       │   │   │   │   ├── parallel_executor.py
│       │   │   │   │   ├── pipeline_executor.py
│       │   │   │   │   └── sync_executor.py
│       │   │   │   ├── lora_format_adapter.py
│       │   │   │   ├── lora_pipeline.py
│       │   │   │   ├── schedule_batch.py
│       │   │   │   └── stages/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── base.py
│       │   │   │       ├── causal_denoising.py
│       │   │   │       ├── comfyui_latent_preparation.py
│       │   │   │       ├── decoding.py
│       │   │   │       ├── decoding_av.py
│       │   │   │       ├── denoising.py
│       │   │   │       ├── denoising_av.py
│       │   │   │       ├── denoising_dmd.py
│       │   │   │       ├── encoding.py
│       │   │   │       ├── hunyuan3d_paint.py
│       │   │   │       ├── hunyuan3d_shape.py
│       │   │   │       ├── image_encoding.py
│       │   │   │       ├── input_validation.py
│       │   │   │       ├── latent_preparation.py
│       │   │   │       ├── latent_preparation_av.py
│       │   │   │       ├── model_specific_stages/
│       │   │   │       │   ├── glm_image.py
│       │   │   │       │   ├── helios_decoding.py
│       │   │   │       │   ├── helios_denoising.py
│       │   │   │       │   ├── mova.py
│       │   │   │       │   └── qwen_image_layered.py
│       │   │   │       ├── text_connector.py
│       │   │   │       ├── text_encoding.py
│       │   │   │       ├── timestep_preparation.py
│       │   │   │       └── validators.py
│       │   │   ├── platforms/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── cpu.py
│       │   │   │   ├── cuda.py
│       │   │   │   ├── interface.py
│       │   │   │   ├── mps.py
│       │   │   │   ├── musa.py
│       │   │   │   ├── npu.py
│       │   │   │   └── rocm.py
│       │   │   ├── postprocess/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── realesrgan_upscaler.py
│       │   │   │   └── rife_interpolator.py
│       │   │   ├── scheduler_client.py
│       │   │   ├── server_args.py
│       │   │   └── utils/
│       │   │       ├── common.py
│       │   │       ├── distributed.py
│       │   │       ├── hf_diffusers_utils.py
│       │   │       ├── layerwise_offload.py
│       │   │       ├── logging_utils.py
│       │   │       ├── mesh3d_utils.py
│       │   │       ├── perf_logger.py
│       │   │       ├── profiler.py
│       │   │       └── quantization_utils.py
│       │   ├── test/
│       │   │   ├── __init__.py
│       │   │   ├── cli/
│       │   │   │   ├── test_generate_common.py
│       │   │   │   ├── test_generate_i2i.py
│       │   │   │   └── test_generate_t2i_perf.py
│       │   │   ├── run_suite.py
│       │   │   ├── scripts/
│       │   │   │   ├── gen_diffusion_ci_outputs.py
│       │   │   │   └── gen_perf_baselines.py
│       │   │   ├── server/
│       │   │   │   ├── ascend/
│       │   │   │   │   ├── perf_baselines_npu.json
│       │   │   │   │   ├── test_server_1_npu.py
│       │   │   │   │   ├── test_server_2_npu.py
│       │   │   │   │   ├── test_server_8_npu.py
│       │   │   │   │   └── testcase_configs_npu.py
│       │   │   │   ├── conftest.py
│       │   │   │   ├── perf_baselines.json
│       │   │   │   ├── test_server_2_gpu_a.py
│       │   │   │   ├── test_server_2_gpu_b.py
│       │   │   │   ├── test_server_a.py
│       │   │   │   ├── test_server_b.py
│       │   │   │   ├── test_server_common.py
│       │   │   │   ├── test_server_utils.py
│       │   │   │   ├── test_update_weights_from_disk.py
│       │   │   │   └── testcase_configs.py
│       │   │   ├── slack_utils.py
│       │   │   ├── test_files/
│       │   │   │   ├── launch_flux.json
│       │   │   │   └── launch_wan.json
│       │   │   ├── test_utils.py
│       │   │   └── unit/
│       │   │       ├── test_lora_format_adapter.py
│       │   │       ├── test_sampling_params.py
│       │   │       ├── test_server_args.py
│       │   │       └── test_storage.py
│       │   ├── third_party/
│       │   │   ├── __init__.py
│       │   │   └── pynvml.py
│       │   ├── tools/
│       │   │   ├── convert_hf_to_fp8.py
│       │   │   └── wan_repack.py
│       │   └── utils.py
│       ├── profiler.py
│       ├── srt/
│       │   ├── batch_invariant_ops/
│       │   │   ├── __init__.py
│       │   │   └── batch_invariant_ops.py
│       │   ├── batch_overlap/
│       │   │   ├── operations.py
│       │   │   ├── operations_strategy.py
│       │   │   ├── single_batch_overlap.py
│       │   │   └── two_batch_overlap.py
│       │   ├── checkpoint_engine/
│       │   │   ├── __init__.py
│       │   │   ├── checkpoint_engine_worker.py
│       │   │   └── update.py
│       │   ├── compilation/
│       │   │   ├── backend.py
│       │   │   ├── compilation_config.py
│       │   │   ├── compilation_counter.py
│       │   │   ├── compile.py
│       │   │   ├── compiler_interface.py
│       │   │   ├── cuda_piecewise_backend.py
│       │   │   ├── fix_functionalization.py
│       │   │   ├── fx_utils.py
│       │   │   ├── inductor_pass.py
│       │   │   ├── npu_piecewise_backend.py
│       │   │   ├── pass_manager.py
│       │   │   ├── piecewise_context_manager.py
│       │   │   └── weak_ref_tensor.py
│       │   ├── configs/
│       │   │   ├── __init__.py
│       │   │   ├── afmoe.py
│       │   │   ├── bailing_hybrid.py
│       │   │   ├── chatglm.py
│       │   │   ├── dbrx.py
│       │   │   ├── deepseek_ocr.py
│       │   │   ├── deepseekvl2.py
│       │   │   ├── device_config.py
│       │   │   ├── dots_ocr.py
│       │   │   ├── dots_vlm.py
│       │   │   ├── exaone.py
│       │   │   ├── falcon_h1.py
│       │   │   ├── granitemoehybrid.py
│       │   │   ├── internvl.py
│       │   │   ├── janus_pro.py
│       │   │   ├── jet_nemotron.py
│       │   │   ├── jet_vlm.py
│       │   │   ├── kimi_k25.py
│       │   │   ├── kimi_linear.py
│       │   │   ├── kimi_vl.py
│       │   │   ├── kimi_vl_moonvit.py
│       │   │   ├── lfm2.py
│       │   │   ├── lfm2_moe.py
│       │   │   ├── load_config.py
│       │   │   ├── longcat_flash.py
│       │   │   ├── mamba_utils.py
│       │   │   ├── model_config.py
│       │   │   ├── modelopt_config.py
│       │   │   ├── nano_nemotron_vl.py
│       │   │   ├── nemotron_h.py
│       │   │   ├── olmo3.py
│       │   │   ├── points_v15_chat.py
│       │   │   ├── qwen3_5.py
│       │   │   ├── qwen3_next.py
│       │   │   ├── qwen3_omni.py
│       │   │   ├── qwen3_vl.py
│       │   │   ├── radio.py
│       │   │   ├── step3_vl.py
│       │   │   ├── step3p5.py
│       │   │   ├── update_config.py
│       │   │   └── utils.py
│       │   ├── connector/
│       │   │   ├── __init__.py
│       │   │   ├── base_connector.py
│       │   │   ├── redis.py
│       │   │   ├── remote_instance.py
│       │   │   ├── s3.py
│       │   │   ├── serde/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── safe_serde.py
│       │   │   │   └── serde.py
│       │   │   └── utils.py
│       │   ├── constants.py
│       │   ├── constrained/
│       │   │   ├── base_grammar_backend.py
│       │   │   ├── grammar_manager.py
│       │   │   ├── llguidance_backend.py
│       │   │   ├── outlines_backend.py
│       │   │   ├── outlines_jump_forward.py
│       │   │   ├── reasoner_grammar_backend.py
│       │   │   ├── triton_ops/
│       │   │   │   └── bitmask_ops.py
│       │   │   ├── utils.py
│       │   │   └── xgrammar_backend.py
│       │   ├── debug_utils/
│       │   │   ├── __init__.py
│       │   │   ├── comparator/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── __main__.py
│       │   │   │   ├── aligner/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── axis_aligner.py
│       │   │   │   │   ├── entrypoint/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── executor.py
│       │   │   │   │   │   ├── planner.py
│       │   │   │   │   │   ├── traced_types.py
│       │   │   │   │   │   └── types.py
│       │   │   │   │   ├── reorderer/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── executor.py
│       │   │   │   │   │   ├── planner.py
│       │   │   │   │   │   └── types.py
│       │   │   │   │   ├── token_aligner/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── concat_steps/
│       │   │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   │   ├── executor.py
│       │   │   │   │   │   │   └── thd_seq_lens_loader.py
│       │   │   │   │   │   ├── entrypoint.py
│       │   │   │   │   │   └── smart/
│       │   │   │   │   │       ├── __init__.py
│       │   │   │   │   │       ├── aux_loader.py
│       │   │   │   │   │       ├── aux_plugins.py
│       │   │   │   │   │       ├── executor.py
│       │   │   │   │   │       ├── planner.py
│       │   │   │   │   │       ├── seq_info_builder.py
│       │   │   │   │   │       └── types.py
│       │   │   │   │   └── unsharder/
│       │   │   │   │       ├── __init__.py
│       │   │   │   │       ├── executor.py
│       │   │   │   │       ├── parallel_info.py
│       │   │   │   │       ├── planner.py
│       │   │   │   │       └── types.py
│       │   │   │   ├── bundle_comparator.py
│       │   │   │   ├── bundle_matcher.py
│       │   │   │   ├── dims_spec/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── comment_parser.py
│       │   │   │   │   ├── dim_parser.py
│       │   │   │   │   ├── dims_parser.py
│       │   │   │   │   ├── modifier_parser.py
│       │   │   │   │   ├── tensor_naming.py
│       │   │   │   │   └── types.py
│       │   │   │   ├── display.py
│       │   │   │   ├── dp_utils.py
│       │   │   │   ├── entrypoint.py
│       │   │   │   ├── log_sink.py
│       │   │   │   ├── meta_overrider.py
│       │   │   │   ├── output_formatter.py
│       │   │   │   ├── output_types.py
│       │   │   │   ├── per_token_visualizer.py
│       │   │   │   ├── preset.py
│       │   │   │   ├── report_sink.py
│       │   │   │   ├── tensor_comparator/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── comparator.py
│       │   │   │   │   ├── formatter.py
│       │   │   │   │   └── types.py
│       │   │   │   ├── utils.py
│       │   │   │   └── visualizer/
│       │   │   │       ├── __init__.py
│       │   │   │       ├── figure.py
│       │   │   │       ├── panels.py
│       │   │   │       └── preprocessing.py
│       │   │   ├── cuda_coredump.py
│       │   │   ├── dump_comparator.py
│       │   │   ├── dump_loader.py
│       │   │   ├── dumper.py
│       │   │   ├── log_parser.py
│       │   │   ├── model_truncator.py
│       │   │   ├── schedule_simulator/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── __main__.py
│       │   │   │   ├── data_source/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── data_loader.py
│       │   │   │   │   └── data_synthesis.py
│       │   │   │   ├── entrypoint.py
│       │   │   │   ├── gpu_state.py
│       │   │   │   ├── metrics.py
│       │   │   │   ├── request.py
│       │   │   │   ├── routers/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── random_router.py
│       │   │   │   │   ├── round_robin_router.py
│       │   │   │   │   └── sticky_router.py
│       │   │   │   ├── schedulers/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   └── fifo_scheduler.py
│       │   │   │   └── simulator.py
│       │   │   ├── source_patcher/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── code_patcher.py
│       │   │   │   ├── source_editor.py
│       │   │   │   └── types.py
│       │   │   ├── tensor_dump_forward_hook.py
│       │   │   └── text_comparator.py
│       │   ├── disaggregation/
│       │   │   ├── ascend/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── conn.py
│       │   │   │   └── transfer_engine.py
│       │   │   ├── base/
│       │   │   │   ├── __init__.py
│       │   │   │   └── conn.py
│       │   │   ├── common/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── conn.py
│       │   │   │   └── utils.py
│       │   │   ├── decode.py
│       │   │   ├── decode_kvcache_offload_manager.py
│       │   │   ├── decode_schedule_batch_mixin.py
│       │   │   ├── encode_grpc_server.py
│       │   │   ├── encode_receiver.py
│       │   │   ├── encode_server.py
│       │   │   ├── fake/
│       │   │   │   ├── __init__.py
│       │   │   │   └── conn.py
│       │   │   ├── kv_events.py
│       │   │   ├── mooncake/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── conn.py
│       │   │   │   └── utils.py
│       │   │   ├── mori/
│       │   │   │   ├── __init__.py
│       │   │   │   └── conn.py
│       │   │   ├── nixl/
│       │   │   │   ├── __init__.py
│       │   │   │   └── conn.py
│       │   │   ├── prefill.py
│       │   │   └── utils.py
│       │   ├── distributed/
│       │   │   ├── __init__.py
│       │   │   ├── communication_op.py
│       │   │   ├── device_communicators/
│       │   │   │   ├── all_reduce_utils.py
│       │   │   │   ├── cuda_wrapper.py
│       │   │   │   ├── custom_all_reduce.py
│       │   │   │   ├── custom_all_reduce_ops.py
│       │   │   │   ├── custom_all_reduce_utils.py
│       │   │   │   ├── hpu_communicator.py
│       │   │   │   ├── mooncake_transfer_engine.py
│       │   │   │   ├── npu_communicator.py
│       │   │   │   ├── pymscclpp.py
│       │   │   │   ├── pynccl.py
│       │   │   │   ├── pynccl_allocator.py
│       │   │   │   ├── pynccl_wrapper.py
│       │   │   │   ├── quick_all_reduce.py
│       │   │   │   ├── shm_broadcast.py
│       │   │   │   ├── torch_symm_mem.py
│       │   │   │   └── xpu_communicator.py
│       │   │   ├── naive_distributed.py
│       │   │   ├── parallel_state.py
│       │   │   └── utils.py
│       │   ├── dllm/
│       │   │   ├── algorithm/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   ├── joint_threshold.py
│       │   │   │   └── low_confidence.py
│       │   │   ├── config.py
│       │   │   └── mixin/
│       │   │       ├── req.py
│       │   │       └── scheduler.py
│       │   ├── elastic_ep/
│       │   │   ├── elastic_ep.py
│       │   │   ├── expert_backup_client.py
│       │   │   └── expert_backup_manager.py
│       │   ├── entrypoints/
│       │   │   ├── EngineBase.py
│       │   │   ├── anthropic/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── protocol.py
│       │   │   │   └── serving.py
│       │   │   ├── context.py
│       │   │   ├── engine.py
│       │   │   ├── grpc_server.py
│       │   │   ├── harmony_utils.py
│       │   │   ├── http_server.py
│       │   │   ├── http_server_engine.py
│       │   │   ├── ollama/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── protocol.py
│       │   │   │   ├── serving.py
│       │   │   │   └── smart_router.py
│       │   │   ├── openai/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── encoding_dsv32.py
│       │   │   │   ├── protocol.py
│       │   │   │   ├── serving_base.py
│       │   │   │   ├── serving_chat.py
│       │   │   │   ├── serving_classify.py
│       │   │   │   ├── serving_completions.py
│       │   │   │   ├── serving_embedding.py
│       │   │   │   ├── serving_rerank.py
│       │   │   │   ├── serving_responses.py
│       │   │   │   ├── serving_score.py
│       │   │   │   ├── serving_tokenize.py
│       │   │   │   ├── serving_transcription.py
│       │   │   │   ├── tool_server.py
│       │   │   │   ├── usage_processor.py
│       │   │   │   └── utils.py
│       │   │   ├── ssl_utils.py
│       │   │   ├── tool.py
│       │   │   ├── v1_loads.py
│       │   │   └── warmup.py
│       │   ├── environ.py
│       │   ├── eplb/
│       │   │   ├── __init__.py
│       │   │   ├── eplb_algorithms/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── deepseek.py
│       │   │   │   ├── deepseek_vec.py
│       │   │   │   └── elasticity_aware.py
│       │   │   ├── eplb_manager.py
│       │   │   ├── eplb_simulator/
│       │   │   │   ├── __init__.py
│       │   │   │   └── reader.py
│       │   │   ├── expert_distribution.py
│       │   │   ├── expert_location.py
│       │   │   ├── expert_location_dispatch.py
│       │   │   └── expert_location_updater.py
│       │   ├── function_call/
│       │   │   ├── base_format_detector.py
│       │   │   ├── core_types.py
│       │   │   ├── deepseekv31_detector.py
│       │   │   ├── deepseekv32_detector.py
│       │   │   ├── deepseekv3_detector.py
│       │   │   ├── function_call_parser.py
│       │   │   ├── gigachat3_detector.py
│       │   │   ├── glm47_moe_detector.py
│       │   │   ├── glm4_moe_detector.py
│       │   │   ├── gpt_oss_detector.py
│       │   │   ├── hermes_detector.py
│       │   │   ├── internlm_detector.py
│       │   │   ├── json_array_parser.py
│       │   │   ├── kimik2_detector.py
│       │   │   ├── lfm2_detector.py
│       │   │   ├── llama32_detector.py
│       │   │   ├── mimo_detector.py
│       │   │   ├── minimax_m2.py
│       │   │   ├── mistral_detector.py
│       │   │   ├── pythonic_detector.py
│       │   │   ├── qwen25_detector.py
│       │   │   ├── qwen3_coder_detector.py
│       │   │   ├── step3_detector.py
│       │   │   ├── trinity_detector.py
│       │   │   └── utils.py
│       │   ├── grpc/
│       │   │   └── __init__.py
│       │   ├── hardware_backend/
│       │   │   └── npu/
│       │   │       ├── allocator_npu.py
│       │   │       ├── attention/
│       │   │       │   ├── ascend_backend.py
│       │   │       │   ├── ascend_torch_native_backend.py
│       │   │       │   └── mla_preprocess.py
│       │   │       ├── cmo.py
│       │   │       ├── graph_runner/
│       │   │       │   ├── eagle_draft_extend_npu_graph_runner.py
│       │   │       │   ├── eagle_draft_npu_graph_runner.py
│       │   │       │   ├── npu_graph_runner.py
│       │   │       │   └── vit_npu_graph_runner.py
│       │   │       ├── memory_pool_npu.py
│       │   │       ├── modules/
│       │   │       │   ├── deepseek_v2_attention_mla_npu.py
│       │   │       │   └── qwen_vl_processor.py
│       │   │       ├── moe/
│       │   │       │   └── topk.py
│       │   │       ├── quantization/
│       │   │       │   ├── fused_moe_method_npu.py
│       │   │       │   └── linear_method_npu.py
│       │   │       └── utils.py
│       │   ├── layers/
│       │   │   ├── activation.py
│       │   │   ├── amx_utils.py
│       │   │   ├── attention/
│       │   │   │   ├── aiter_backend.py
│       │   │   │   ├── attention_registry.py
│       │   │   │   ├── base_attn_backend.py
│       │   │   │   ├── cutlass_mla_backend.py
│       │   │   │   ├── double_sparsity_backend.py
│       │   │   │   ├── dual_chunk_flashattention_backend.py
│       │   │   │   ├── fla/
│       │   │   │   │   ├── chunk.py
│       │   │   │   │   ├── chunk_delta_h.py
│       │   │   │   │   ├── chunk_o.py
│       │   │   │   │   ├── chunk_scaled_dot_kkt.py
│       │   │   │   │   ├── cumsum.py
│       │   │   │   │   ├── fused_gdn_gating.py
│       │   │   │   │   ├── fused_norm_gate.py
│       │   │   │   │   ├── fused_recurrent.py
│       │   │   │   │   ├── fused_sigmoid_gating_recurrent.py
│       │   │   │   │   ├── index.py
│       │   │   │   │   ├── kda.py
│       │   │   │   │   ├── l2norm.py
│       │   │   │   │   ├── layernorm_gated.py
│       │   │   │   │   ├── op.py
│       │   │   │   │   ├── solve_tril.py
│       │   │   │   │   ├── utils.py
│       │   │   │   │   └── wy_fast.py
│       │   │   │   ├── flashattention_backend.py
│       │   │   │   ├── flashinfer_backend.py
│       │   │   │   ├── flashinfer_mla_backend.py
│       │   │   │   ├── flashmla_backend.py
│       │   │   │   ├── hybrid_attn_backend.py
│       │   │   │   ├── hybrid_linear_attn_backend.py
│       │   │   │   ├── intel_amx_backend.py
│       │   │   │   ├── linear/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── gdn_backend.py
│       │   │   │   │   ├── kda_backend.py
│       │   │   │   │   ├── kernels/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── gdn_cutedsl.py
│       │   │   │   │   │   ├── gdn_flashinfer.py
│       │   │   │   │   │   ├── gdn_triton.py
│       │   │   │   │   │   ├── kda_triton.py
│       │   │   │   │   │   └── kernel_backend.py
│       │   │   │   │   ├── lightning_attn.py
│       │   │   │   │   ├── lightning_backend.py
│       │   │   │   │   ├── linear_metadata.py
│       │   │   │   │   ├── seg_la.py
│       │   │   │   │   └── utils.py
│       │   │   │   ├── mamba/
│       │   │   │   │   ├── causal_conv1d.py
│       │   │   │   │   ├── causal_conv1d_triton.py
│       │   │   │   │   ├── mamba.py
│       │   │   │   │   ├── mamba2_metadata.py
│       │   │   │   │   ├── mamba_state_scatter_triton.py
│       │   │   │   │   ├── mixer2_rms_norm_gated.py
│       │   │   │   │   └── ops/
│       │   │   │   │       ├── __init__.py
│       │   │   │   │       ├── layernorm_gated.py
│       │   │   │   │       ├── mamba_ssm.py
│       │   │   │   │       ├── ssd_bmm.py
│       │   │   │   │       ├── ssd_chunk_scan.py
│       │   │   │   │       ├── ssd_chunk_state.py
│       │   │   │   │       ├── ssd_combined.py
│       │   │   │   │       ├── ssd_state_passing.py
│       │   │   │   │       └── ssu_dispatch.py
│       │   │   │   ├── merge_state.py
│       │   │   │   ├── nsa/
│       │   │   │   │   ├── dequant_k_cache.py
│       │   │   │   │   ├── index_buf_accessor.py
│       │   │   │   │   ├── nsa_backend_mtp_precompute.py
│       │   │   │   │   ├── nsa_indexer.py
│       │   │   │   │   ├── nsa_mtp_verification.py
│       │   │   │   │   ├── quant_k_cache.py
│       │   │   │   │   ├── tilelang_kernel.py
│       │   │   │   │   ├── transform_index.py
│       │   │   │   │   ├── triton_kernel.py
│       │   │   │   │   └── utils.py
│       │   │   │   ├── nsa_backend.py
│       │   │   │   ├── tbo_backend.py
│       │   │   │   ├── torch_flex_backend.py
│       │   │   │   ├── torch_native_backend.py
│       │   │   │   ├── triton_backend.py
│       │   │   │   ├── triton_ops/
│       │   │   │   │   ├── decode_attention.py
│       │   │   │   │   ├── double_sparsity_attention.py
│       │   │   │   │   ├── extend_attention.py
│       │   │   │   │   ├── merge_state.py
│       │   │   │   │   ├── prefill_attention.py
│       │   │   │   │   ├── rocm_mla_decode_rope.py
│       │   │   │   │   └── trtllm_fp8_kv_kernel.py
│       │   │   │   ├── trtllm_mha_backend.py
│       │   │   │   ├── trtllm_mla_backend.py
│       │   │   │   ├── utils.py
│       │   │   │   ├── vision.py
│       │   │   │   ├── vision_utils.py
│       │   │   │   ├── wave_backend.py
│       │   │   │   ├── wave_ops/
│       │   │   │   │   ├── decode_attention.py
│       │   │   │   │   ├── extend_attention.py
│       │   │   │   │   └── prefill_attention.py
│       │   │   │   └── xpu_backend.py
│       │   │   ├── communicator.py
│       │   │   ├── communicator_nsa_cp.py
│       │   │   ├── conv.py
│       │   │   ├── deep_gemm_wrapper/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── compile_utils.py
│       │   │   │   ├── configurer.py
│       │   │   │   └── entrypoint.py
│       │   │   ├── dp_attention.py
│       │   │   ├── elementwise.py
│       │   │   ├── flashinfer_comm_fusion.py
│       │   │   ├── int4fp8_utils.py
│       │   │   ├── layernorm.py
│       │   │   ├── linear.py
│       │   │   ├── logits_processor.py
│       │   │   ├── model_parallel.py
│       │   │   ├── modelopt_utils.py
│       │   │   ├── moe/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── cutlass_moe.py
│       │   │   │   ├── cutlass_moe_params.py
│       │   │   │   ├── cutlass_w4a8_moe.py
│       │   │   │   ├── ep_moe/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── kernels.py
│       │   │   │   │   └── layer.py
│       │   │   │   ├── flashinfer_cutedsl_moe.py
│       │   │   │   ├── flashinfer_trtllm_moe.py
│       │   │   │   ├── fused_moe_native.py
│       │   │   │   ├── fused_moe_triton/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── configs/
│       │   │   │   │   │   ├── README.md
│       │   │   │   │   │   ├── triton_3_1_0/
│       │   │   │   │   │   │   ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=16,N=1024,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
│       │   │   │   │   │   │   ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│       │   │   │   │   │   │   ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=1280,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=2560,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=320,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=64,N=640,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json
│       │   │   │   │   │   │   ├── E=8,N=14336,device_name=AMD_Instinct_MI325X.json
│       │   │   │   │   │   │   ├── E=8,N=14336,device_name=AMD_Radeon_Graphics.json
│       │   │   │   │   │   │   ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=14336,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json
│       │   │   │   │   │   │   ├── E=8,N=1792,device_name=AMD_Instinct_MI325X.json
│       │   │   │   │   │   │   ├── E=8,N=1792,device_name=AMD_Radeon_Graphics.json
│       │   │   │   │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
│       │   │   │   │   │   │   ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=1792,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=2048,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=AMD_Instinct_MI325X.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=AMD_Radeon_Graphics.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=8,N=3584,device_name=NVIDIA_L40S.json
│       │   │   │   │   │   │   ├── E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=4096,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json
│       │   │   │   │   │   │   ├── E=8,N=7168,device_name=AMD_Instinct_MI325X.json
│       │   │   │   │   │   │   ├── E=8,N=7168,device_name=AMD_Radeon_Graphics.json
│       │   │   │   │   │   │   ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=7168,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   └── E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   ├── triton_3_2_0/
│       │   │   │   │   │   │   ├── E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=128,N=192,device_name=NVIDIA_H20.json
│       │   │   │   │   │   │   ├── E=128,N=192,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=128,N=384,device_name=NVIDIA_H20.json
│       │   │   │   │   │   │   ├── E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=128,N=384,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=128,N=768,device_name=NVIDIA_H20.json
│       │   │   │   │   │   │   ├── E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=128,N=768,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=128,N=96,device_name=NVIDIA_H20.json
│       │   │   │   │   │   │   ├── E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│       │   │   │   │   │   │   ├── E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│       │   │   │   │   │   │   └── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   ├── triton_3_3_0/
│       │   │   │   │   │   │   └── E=16,N=1024,device_name=NVIDIA_B200.json
│       │   │   │   │   │   ├── triton_3_3_1/
│       │   │   │   │   │   │   ├── E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=128,N=768,device_name=NVIDIA_H20.json
│       │   │   │   │   │   │   ├── E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=160,N=320,device_name=NVIDIA_H20-3e.json
│       │   │   │   │   │   │   ├── E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   └── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   ├── triton_3_4_0/
│       │   │   │   │   │   │   ├── E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=128,N=1856,device_name=NVIDIA_L40S.json
│       │   │   │   │   │   │   ├── E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=128,N=928,device_name=NVIDIA_L40S.json
│       │   │   │   │   │   │   ├── E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=160,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │   │   ├── E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=NVIDIA_B200.json
│       │   │   │   │   │   │   ├── E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=256,N=512,device_name=NVIDIA_B200.json
│       │   │   │   │   │   │   ├── E=256,N=512,device_name=NVIDIA_H20.json
│       │   │   │   │   │   │   ├── E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│       │   │   │   │   │   │   ├── E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
│       │   │   │   │   │   │   ├── E=384,N=128,device_name=,dtype=int4_w4a16.json
│       │   │   │   │   │   │   ├── E=384,N=128,device_name=,dtype=int4_w4a16_down.json
│       │   │   │   │   │   │   ├── E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   ├── E=512,N=128,device_name=NVIDIA_H20-3e.json
│       │   │   │   │   │   │   ├── E=512,N=128,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │   │   ├── E=512,N=256,device_name=NVIDIA_B200.json
│       │   │   │   │   │   │   ├── E=512,N=256,device_name=NVIDIA_H20-3e.json
│       │   │   │   │   │   │   ├── E=512,N=256,device_name=NVIDIA_H200.json
│       │   │   │   │   │   │   ├── E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │   │   └── E=512,N=64,device_name=NVIDIA_H200.json
│       │   │   │   │   │   └── triton_3_5_1/
│       │   │   │   │   │       ├── E=128,N=1344,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=128,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=128,N=1856,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=128,N=232,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=128,N=232,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=128,N=2688,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=128,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=128,N=464,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=128,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=128,N=928,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=16,N=1856,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=16,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=16,N=2048,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=161,N=192,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=161,N=192,device_name=NVIDIA_H20,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=161,N=192,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=161,N=192,device_name=NVIDIA_H200.json
│       │   │   │   │   │       ├── E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition.json
│       │   │   │   │   │       ├── E=161,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=161,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=20,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=20,N=1536,device_name=NVIDIA_H200.json
│       │   │   │   │   │       ├── E=256,N=1344,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=256,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=256,N=2688,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=256,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │       ├── E=256,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │       ├── E=256,N=672,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=256,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │       ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│       │   │   │   │   │       ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │       ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│       │   │   │   │   │       ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │       ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│       │   │   │   │   │       ├── E=32,N=1856,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=32,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=32,N=928,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=32,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=40,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│       │   │   │   │   │       ├── E=512,N=128,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=512,N=128,device_name=NVIDIA_H200.json
│       │   │   │   │   │       ├── E=512,N=1344,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=512,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=512,N=256,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │       ├── E=512,N=256,device_name=NVIDIA_H200.json
│       │   │   │   │   │       ├── E=512,N=2688,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=512,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=512,N=336,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=512,N=336,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=512,N=672,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=512,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=64,N=1856,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=64,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=64,N=2688,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=64,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=64,N=464,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=64,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=64,N=928,device_name=NVIDIA_B200.json
│       │   │   │   │   │       ├── E=64,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│       │   │   │   │   │       ├── E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   │       └── E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│       │   │   │   │   ├── fused_marlin_moe.py
│       │   │   │   │   ├── fused_moe.py
│       │   │   │   │   ├── fused_moe_triton_config.py
│       │   │   │   │   ├── fused_moe_triton_kernels.py
│       │   │   │   │   ├── layer.py
│       │   │   │   │   ├── moe_align_block_size.py
│       │   │   │   │   └── triton_kernels_moe.py
│       │   │   │   ├── kt_ep_wrapper.py
│       │   │   │   ├── moe_runner/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── deep_gemm.py
│       │   │   │   │   ├── flashinfer_trtllm.py
│       │   │   │   │   ├── marlin.py
│       │   │   │   │   ├── runner.py
│       │   │   │   │   ├── triton.py
│       │   │   │   │   └── triton_kernels.py
│       │   │   │   ├── rocm_moe_utils.py
│       │   │   │   ├── routed_experts_capturer.py
│       │   │   │   ├── router.py
│       │   │   │   ├── token_dispatcher/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base.py
│       │   │   │   │   ├── deepep.py
│       │   │   │   │   ├── flashinfer.py
│       │   │   │   │   ├── flashinfer_utils.py
│       │   │   │   │   ├── fuseep.py
│       │   │   │   │   ├── mooncake.py
│       │   │   │   │   ├── moriep.py
│       │   │   │   │   ├── nixl.py
│       │   │   │   │   └── standard.py
│       │   │   │   ├── topk.py
│       │   │   │   └── utils.py
│       │   │   ├── multimodal.py
│       │   │   ├── n_gram_embedding.py
│       │   │   ├── parameter.py
│       │   │   ├── pooler.py
│       │   │   ├── quantization/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── auto_round.py
│       │   │   │   ├── awq.py
│       │   │   │   ├── awq_triton.py
│       │   │   │   ├── base_config.py
│       │   │   │   ├── base_scheme.py
│       │   │   │   ├── bitsandbytes.py
│       │   │   │   ├── blockwise_int8.py
│       │   │   │   ├── compressed_tensors/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── compressed_tensors.py
│       │   │   │   │   ├── schemes/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── compressed_tensors_scheme.py
│       │   │   │   │   │   ├── compressed_tensors_w4a4_mxint4_moe.py
│       │   │   │   │   │   ├── compressed_tensors_w4a4_nvfp4.py
│       │   │   │   │   │   ├── compressed_tensors_w4a4_nvfp4_moe.py
│       │   │   │   │   │   ├── compressed_tensors_w4a8_int8_moe.py
│       │   │   │   │   │   ├── compressed_tensors_w8a16_fp8.py
│       │   │   │   │   │   ├── compressed_tensors_w8a8_fp8.py
│       │   │   │   │   │   ├── compressed_tensors_w8a8_fp8_moe.py
│       │   │   │   │   │   ├── compressed_tensors_w8a8_int8.py
│       │   │   │   │   │   ├── compressed_tensors_w8a8_int8_moe.py
│       │   │   │   │   │   ├── compressed_tensors_wNa16.py
│       │   │   │   │   │   └── compressed_tensors_wNa16_moe.py
│       │   │   │   │   └── utils.py
│       │   │   │   ├── configs/
│       │   │   │   │   ├── N=1280,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2048,K=4096,device_name=NVIDIA_L40,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=5120,K=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=5120,K=2048,device_name=NVIDIA_L40,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=5120,K=3200,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=6400,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   ├── N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│       │   │   │   │   └── README.md
│       │   │   │   ├── fp4_utils.py
│       │   │   │   ├── fp8.py
│       │   │   │   ├── fp8_kernel.py
│       │   │   │   ├── fp8_utils.py
│       │   │   │   ├── fpgemm_fp8.py
│       │   │   │   ├── gguf.py
│       │   │   │   ├── gptq.py
│       │   │   │   ├── int8_kernel.py
│       │   │   │   ├── int8_utils.py
│       │   │   │   ├── kv_cache.py
│       │   │   │   ├── kvfp4_tensor.py
│       │   │   │   ├── marlin_utils.py
│       │   │   │   ├── marlin_utils_fp8.py
│       │   │   │   ├── modelopt_quant.py
│       │   │   │   ├── modelslim/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── modelslim.py
│       │   │   │   │   └── schemes/
│       │   │   │   │       ├── __init__.py
│       │   │   │   │       ├── modelslim_scheme.py
│       │   │   │   │       ├── modelslim_w4a4_int4.py
│       │   │   │   │       ├── modelslim_w4a4_int4_moe.py
│       │   │   │   │       ├── modelslim_w4a8_int8_moe.py
│       │   │   │   │       ├── modelslim_w8a8_int8.py
│       │   │   │   │       └── modelslim_w8a8_int8_moe.py
│       │   │   │   ├── moe_wna16.py
│       │   │   │   ├── mxfp4.py
│       │   │   │   ├── mxfp4_tensor.py
│       │   │   │   ├── petit.py
│       │   │   │   ├── petit_utils.py
│       │   │   │   ├── qoq.py
│       │   │   │   ├── quark/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── quark.py
│       │   │   │   │   ├── schemes/
│       │   │   │   │   │   ├── __init__.py
│       │   │   │   │   │   ├── quark_scheme.py
│       │   │   │   │   │   ├── quark_w4a4_mxfp4.py
│       │   │   │   │   │   ├── quark_w4a4_mxfp4_moe.py
│       │   │   │   │   │   ├── quark_w8a8_fp8.py
│       │   │   │   │   │   └── quark_w8a8_fp8_moe.py
│       │   │   │   │   └── utils.py
│       │   │   │   ├── quark_int4fp8_moe.py
│       │   │   │   ├── rocm_mxfp4_utils.py
│       │   │   │   ├── unquant.py
│       │   │   │   ├── utils.py
│       │   │   │   ├── w4afp8.py
│       │   │   │   ├── w8a8_fp8.py
│       │   │   │   └── w8a8_int8.py
│       │   │   ├── radix_attention.py
│       │   │   ├── radix_linear_attention.py
│       │   │   ├── rocm_linear_utils.py
│       │   │   ├── rotary_embedding/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base.py
│       │   │   │   ├── factory.py
│       │   │   │   ├── mrope.py
│       │   │   │   ├── mrope_rope_index.py
│       │   │   │   ├── rope_variant.py
│       │   │   │   ├── triton_kernels.py
│       │   │   │   ├── utils.py
│       │   │   │   └── yarn.py
│       │   │   ├── sampler.py
│       │   │   ├── sparse_pooler.py
│       │   │   ├── torchao_utils.py
│       │   │   ├── utils/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── common.py
│       │   │   │   ├── hash.py
│       │   │   │   ├── logprob.py
│       │   │   │   └── multi_platform.py
│       │   │   └── vocab_parallel_embedding.py
│       │   ├── lora/
│       │   │   ├── backend/
│       │   │   │   ├── ascend_backend.py
│       │   │   │   ├── base_backend.py
│       │   │   │   ├── chunked_backend.py
│       │   │   │   ├── lmhead_mixing.py
│       │   │   │   ├── lora_registry.py
│       │   │   │   ├── torch_backend.py
│       │   │   │   └── triton_backend.py
│       │   │   ├── eviction_policy.py
│       │   │   ├── layers.py
│       │   │   ├── lora.py
│       │   │   ├── lora_config.py
│       │   │   ├── lora_manager.py
│       │   │   ├── lora_overlap_loader.py
│       │   │   ├── lora_registry.py
│       │   │   ├── mem_pool.py
│       │   │   ├── torch_ops/
│       │   │   │   ├── __init__.py
│       │   │   │   └── lora_ops.py
│       │   │   ├── triton_ops/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── chunked_embedding_lora_a.py
│       │   │   │   ├── chunked_sgmv_expand.py
│       │   │   │   ├── chunked_sgmv_shrink.py
│       │   │   │   ├── embedding_lora_a.py
│       │   │   │   ├── fused_moe_lora_kernel.py
│       │   │   │   ├── gate_up_lora_b.py
│       │   │   │   ├── qkv_lora_b.py
│       │   │   │   ├── sgemm_lora_a.py
│       │   │   │   └── sgemm_lora_b.py
│       │   │   └── utils.py
│       │   ├── managers/
│       │   │   ├── async_dynamic_batch_tokenizer.py
│       │   │   ├── async_mm_data_processor.py
│       │   │   ├── cache_controller.py
│       │   │   ├── configure_logging.py
│       │   │   ├── data_parallel_controller.py
│       │   │   ├── detokenizer_manager.py
│       │   │   ├── disagg_service.py
│       │   │   ├── io_struct.py
│       │   │   ├── mm_utils.py
│       │   │   ├── multi_tokenizer_mixin.py
│       │   │   ├── multimodal_processor.py
│       │   │   ├── overlap_utils.py
│       │   │   ├── prefill_delayer.py
│       │   │   ├── schedule_batch.py
│       │   │   ├── schedule_policy.py
│       │   │   ├── scheduler.py
│       │   │   ├── scheduler_dp_attn_mixin.py
│       │   │   ├── scheduler_input_blocker.py
│       │   │   ├── scheduler_output_processor_mixin.py
│       │   │   ├── scheduler_pp_mixin.py
│       │   │   ├── scheduler_profiler_mixin.py
│       │   │   ├── scheduler_recv_skipper.py
│       │   │   ├── scheduler_runtime_checker_mixin.py
│       │   │   ├── scheduler_update_weights_mixin.py
│       │   │   ├── session_controller.py
│       │   │   ├── template_manager.py
│       │   │   ├── tokenizer_communicator_mixin.py
│       │   │   ├── tokenizer_manager.py
│       │   │   ├── tokenizer_manager_multiitem_mixin.py
│       │   │   ├── tp_worker.py
│       │   │   └── utils.py
│       │   ├── mem_cache/
│       │   │   ├── allocator.py
│       │   │   ├── base_prefix_cache.py
│       │   │   ├── cache_init_params.py
│       │   │   ├── chunk_cache.py
│       │   │   ├── common.py
│       │   │   ├── cpp_radix_tree/
│       │   │   │   ├── common.h
│       │   │   │   ├── radix_tree.py
│       │   │   │   ├── tree_v2.cpp
│       │   │   │   ├── tree_v2.h
│       │   │   │   ├── tree_v2_binding.cpp
│       │   │   │   ├── tree_v2_debug.cpp
│       │   │   │   ├── tree_v2_impl.h
│       │   │   │   └── tree_v2_node.h
│       │   │   ├── evict_policy.py
│       │   │   ├── flush_cache.py
│       │   │   ├── hi_mamba_radix_cache.py
│       │   │   ├── hicache_storage.py
│       │   │   ├── hiradix_cache.py
│       │   │   ├── mamba_radix_cache.py
│       │   │   ├── memory_pool.py
│       │   │   ├── memory_pool_host.py
│       │   │   ├── multimodal_cache.py
│       │   │   ├── radix_cache.py
│       │   │   ├── radix_cache_cpp.py
│       │   │   ├── session_aware_cache.py
│       │   │   ├── sparsity/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── algorithms/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base_algorithm.py
│       │   │   │   │   ├── deepseek_nsa.py
│       │   │   │   │   └── quest_algorithm.py
│       │   │   │   ├── backend/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── backend_adaptor.py
│       │   │   │   ├── core/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   └── sparse_coordinator.py
│       │   │   │   └── factory.py
│       │   │   ├── storage/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── aibrix_kvcache/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── aibrix_kvcache_storage.py
│       │   │   │   │   └── unit_test.py
│       │   │   │   ├── backend_factory.py
│       │   │   │   ├── eic/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── eic_storage.py
│       │   │   │   │   └── test_unit.py
│       │   │   │   ├── hf3fs/
│       │   │   │   │   ├── docs/
│       │   │   │   │   │   ├── README.md
│       │   │   │   │   │   ├── deploy_sglang_3fs_multinode.md
│       │   │   │   │   │   └── setup_usrbio_client.md
│       │   │   │   │   ├── hf3fs_client.py
│       │   │   │   │   ├── hf3fs_usrbio_client.py
│       │   │   │   │   ├── hf3fs_utils.cpp
│       │   │   │   │   ├── mini_3fs_metadata_server.py
│       │   │   │   │   ├── storage_hf3fs.py
│       │   │   │   │   └── test_hf3fs_utils.py
│       │   │   │   ├── lmcache/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── example_config.yaml
│       │   │   │   │   ├── lmc_radix_cache.py
│       │   │   │   │   └── unit_test.py
│       │   │   │   ├── mooncake_store/
│       │   │   │   │   ├── README.md
│       │   │   │   │   ├── embedding_cache_controller.py
│       │   │   │   │   ├── mooncake_embedding_store.py
│       │   │   │   │   ├── mooncake_store.py
│       │   │   │   │   └── test_mooncake_store.py
│       │   │   │   └── nixl/
│       │   │   │       ├── README.md
│       │   │   │       ├── hicache_nixl.py
│       │   │   │       ├── nixl.config.toml.sample
│       │   │   │       ├── nixl_utils.py
│       │   │   │       └── test_hicache_nixl_storage.py
│       │   │   ├── swa_memory_pool.py
│       │   │   ├── swa_radix_cache.py
│       │   │   └── utils.py
│       │   ├── model_executor/
│       │   │   ├── cpu_graph_runner.py
│       │   │   ├── cuda_graph_runner.py
│       │   │   ├── forward_batch_deepseek_mha_mixin.py
│       │   │   ├── forward_batch_info.py
│       │   │   ├── hook_manager.py
│       │   │   ├── input_buffers.py
│       │   │   ├── mindspore_runner.py
│       │   │   ├── model_runner.py
│       │   │   ├── model_runner_kv_cache_mixin.py
│       │   │   └── piecewise_cuda_graph_runner.py
│       │   ├── model_loader/
│       │   │   ├── __init__.py
│       │   │   ├── ci_weight_validation.py
│       │   │   ├── loader.py
│       │   │   ├── remote_instance_weight_loader_utils.py
│       │   │   ├── utils.py
│       │   │   └── weight_utils.py
│       │   ├── models/
│       │   │   ├── afmoe.py
│       │   │   ├── apertus.py
│       │   │   ├── arcee.py
│       │   │   ├── baichuan.py
│       │   │   ├── bailing_moe.py
│       │   │   ├── bailing_moe_linear.py
│       │   │   ├── bailing_moe_nextn.py
│       │   │   ├── bert.py
│       │   │   ├── chatglm.py
│       │   │   ├── clip.py
│       │   │   ├── commandr.py
│       │   │   ├── dbrx.py
│       │   │   ├── deepseek.py
│       │   │   ├── deepseek_common/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── attention_backend_handler.py
│       │   │   │   ├── attention_forward_methods/
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── forward_methods.py
│       │   │   │   │   ├── forward_mha.py
│       │   │   │   │   ├── forward_mla.py
│       │   │   │   │   ├── forward_mla_fused_rope_cpu.py
│       │   │   │   │   └── forward_mla_fused_rope_rocm.py
│       │   │   │   ├── deepseek_weight_loader.py
│       │   │   │   └── utils.py
│       │   │   ├── deepseek_janus_pro.py
│       │   │   ├── deepseek_nextn.py
│       │   │   ├── deepseek_ocr.py
│       │   │   ├── deepseek_v2.py
│       │   │   ├── deepseek_vl2.py
│       │   │   ├── dots_ocr.py
│       │   │   ├── dots_vlm.py
│       │   │   ├── dots_vlm_vit.py
│       │   │   ├── ernie4.py
│       │   │   ├── ernie45_moe_vl.py
│       │   │   ├── ernie45_vl.py
│       │   │   ├── ernie4_eagle.py
│       │   │   ├── exaone.py
│       │   │   ├── exaone4.py
│       │   │   ├── exaone_moe.py
│       │   │   ├── exaone_moe_mtp.py
│       │   │   ├── falcon_h1.py
│       │   │   ├── gemma.py
│       │   │   ├── gemma2.py
│       │   │   ├── gemma2_reward.py
│       │   │   ├── gemma3_causal.py
│       │   │   ├── gemma3_mm.py
│       │   │   ├── gemma3n_audio.py
│       │   │   ├── gemma3n_causal.py
│       │   │   ├── gemma3n_mm.py
│       │   │   ├── glm4.py
│       │   │   ├── glm4_moe.py
│       │   │   ├── glm4_moe_lite.py
│       │   │   ├── glm4_moe_nextn.py
│       │   │   ├── glm4v.py
│       │   │   ├── glm4v_moe.py
│       │   │   ├── glm_ocr.py
│       │   │   ├── glm_ocr_nextn.py
│       │   │   ├── glmasr.py
│       │   │   ├── gpt2.py
│       │   │   ├── gpt_bigcode.py
│       │   │   ├── gpt_j.py
│       │   │   ├── gpt_oss.py
│       │   │   ├── granite.py
│       │   │   ├── granitemoe.py
│       │   │   ├── granitemoehybrid.py
│       │   │   ├── grok.py
│       │   │   ├── hunyuan.py
│       │   │   ├── idefics2.py
│       │   │   ├── internlm2.py
│       │   │   ├── internlm2_reward.py
│       │   │   ├── interns1.py
│       │   │   ├── interns1pro.py
│       │   │   ├── internvl.py
│       │   │   ├── iquest_loopcoder.py
│       │   │   ├── jet_nemotron.py
│       │   │   ├── jet_vlm.py
│       │   │   ├── kimi_k25.py
│       │   │   ├── kimi_linear.py
│       │   │   ├── kimi_vl.py
│       │   │   ├── kimi_vl_moonvit.py
│       │   │   ├── lfm2.py
│       │   │   ├── lfm2_moe.py
│       │   │   ├── lightonocr.py
│       │   │   ├── llada2.py
│       │   │   ├── llama.py
│       │   │   ├── llama4.py
│       │   │   ├── llama_classification.py
│       │   │   ├── llama_eagle.py
│       │   │   ├── llama_eagle3.py
│       │   │   ├── llama_embedding.py
│       │   │   ├── llama_reward.py
│       │   │   ├── llava.py
│       │   │   ├── llavavid.py
│       │   │   ├── longcat_flash.py
│       │   │   ├── longcat_flash_nextn.py
│       │   │   ├── midashenglm.py
│       │   │   ├── mimo.py
│       │   │   ├── mimo_mtp.py
│       │   │   ├── mimo_v2_flash.py
│       │   │   ├── mimo_v2_flash_nextn.py
│       │   │   ├── mindspore.py
│       │   │   ├── minicpm.py
│       │   │   ├── minicpm3.py
│       │   │   ├── minicpmo.py
│       │   │   ├── minicpmv.py
│       │   │   ├── minimax_m2.py
│       │   │   ├── ministral3.py
│       │   │   ├── mistral.py
│       │   │   ├── mistral_large_3.py
│       │   │   ├── mistral_large_3_eagle.py
│       │   │   ├── mixtral.py
│       │   │   ├── mixtral_quant.py
│       │   │   ├── mllama.py
│       │   │   ├── mllama4.py
│       │   │   ├── nano_nemotron_vl.py
│       │   │   ├── nemotron_h.py
│       │   │   ├── nemotron_h_mtp.py
│       │   │   ├── nemotron_nas.py
│       │   │   ├── nvila.py
│       │   │   ├── nvila_lite.py
│       │   │   ├── olmo.py
│       │   │   ├── olmo2.py
│       │   │   ├── olmoe.py
│       │   │   ├── opt.py
│       │   │   ├── orion.py
│       │   │   ├── paddleocr_vl.py
│       │   │   ├── persimmon.py
│       │   │   ├── phi.py
│       │   │   ├── phi3_small.py
│       │   │   ├── phi4mm.py
│       │   │   ├── phi4mm_audio.py
│       │   │   ├── phi4mm_utils.py
│       │   │   ├── phimoe.py
│       │   │   ├── pixtral.py
│       │   │   ├── points_v15_chat.py
│       │   │   ├── qwen.py
│       │   │   ├── qwen2.py
│       │   │   ├── qwen2_5_vl.py
│       │   │   ├── qwen2_audio.py
│       │   │   ├── qwen2_classification.py
│       │   │   ├── qwen2_eagle.py
│       │   │   ├── qwen2_moe.py
│       │   │   ├── qwen2_rm.py
│       │   │   ├── qwen2_vl.py
│       │   │   ├── qwen3.py
│       │   │   ├── qwen3_5.py
│       │   │   ├── qwen3_5_mtp.py
│       │   │   ├── qwen3_classification.py
│       │   │   ├── qwen3_moe.py
│       │   │   ├── qwen3_next.py
│       │   │   ├── qwen3_next_mtp.py
│       │   │   ├── qwen3_omni_moe.py
│       │   │   ├── qwen3_rm.py
│       │   │   ├── qwen3_vl.py
│       │   │   ├── qwen3_vl_moe.py
│       │   │   ├── radio.py
│       │   │   ├── registry.py
│       │   │   ├── roberta.py
│       │   │   ├── sarashina2_vision.py
│       │   │   ├── sarvam_moe.py
│       │   │   ├── sdar.py
│       │   │   ├── sdar_moe.py
│       │   │   ├── siglip.py
│       │   │   ├── solar.py
│       │   │   ├── stablelm.py
│       │   │   ├── starcoder2.py
│       │   │   ├── step3_vl.py
│       │   │   ├── step3_vl_10b.py
│       │   │   ├── step3p5.py
│       │   │   ├── step3p5_mtp.py
│       │   │   ├── teleflm.py
│       │   │   ├── torch_native_llama.py
│       │   │   ├── transformers.py
│       │   │   ├── utils.py
│       │   │   ├── whisper.py
│       │   │   ├── xverse.py
│       │   │   ├── xverse_moe.py
│       │   │   └── yivl.py
│       │   ├── multimodal/
│       │   │   ├── customized_mm_processor_utils.py
│       │   │   ├── evs/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── evs_core.py
│       │   │   │   ├── evs_module.py
│       │   │   │   └── evs_processor.py
│       │   │   ├── internvl_utils.py
│       │   │   ├── internvl_vit_cuda_graph_runner.py
│       │   │   ├── mm_utils.py
│       │   │   ├── processors/
│       │   │   │   ├── base_processor.py
│       │   │   │   ├── clip.py
│       │   │   │   ├── deepseek_ocr.py
│       │   │   │   ├── deepseek_vl_v2.py
│       │   │   │   ├── dots_vlm.py
│       │   │   │   ├── ernie45_vl.py
│       │   │   │   ├── gemma3.py
│       │   │   │   ├── gemma3n.py
│       │   │   │   ├── glm4v.py
│       │   │   │   ├── glmasr.py
│       │   │   │   ├── interns1pro.py
│       │   │   │   ├── internvl.py
│       │   │   │   ├── janus_pro.py
│       │   │   │   ├── kimi_k25.py
│       │   │   │   ├── kimi_vl.py
│       │   │   │   ├── lightonocr.py
│       │   │   │   ├── llava.py
│       │   │   │   ├── midashenglm.py
│       │   │   │   ├── minicpm.py
│       │   │   │   ├── mlama.py
│       │   │   │   ├── mllama4.py
│       │   │   │   ├── nano_nemotron_vl.py
│       │   │   │   ├── nvila.py
│       │   │   │   ├── paddleocr_vlm.py
│       │   │   │   ├── phi4mm.py
│       │   │   │   ├── pixtral.py
│       │   │   │   ├── points_v15_chat.py
│       │   │   │   ├── qwen_audio.py
│       │   │   │   ├── qwen_vl.py
│       │   │   │   ├── sarashina2_vision.py
│       │   │   │   ├── step3_vl.py
│       │   │   │   └── whisper.py
│       │   │   └── vit_cuda_graph_runner.py
│       │   ├── multiplex/
│       │   │   ├── multiplexing_mixin.py
│       │   │   └── pdmux_context.py
│       │   ├── observability/
│       │   │   ├── cpu_monitor.py
│       │   │   ├── func_timer.py
│       │   │   ├── label_transform.py
│       │   │   ├── metrics_collector.py
│       │   │   ├── req_time_stats.py
│       │   │   ├── request_metrics_exporter.py
│       │   │   ├── scheduler_metrics_mixin.py
│       │   │   ├── startup_func_log_and_timer.py
│       │   │   ├── trace.py
│       │   │   └── utils.py
│       │   ├── parser/
│       │   │   ├── code_completion_parser.py
│       │   │   ├── conversation.py
│       │   │   ├── harmony_parser.py
│       │   │   ├── jinja_template_utils.py
│       │   │   └── reasoning_parser.py
│       │   ├── ray/
│       │   │   ├── __init__.py
│       │   │   ├── engine.py
│       │   │   ├── http_server.py
│       │   │   └── scheduler_actor.py
│       │   ├── sampling/
│       │   │   ├── custom_logit_processor.py
│       │   │   ├── penaltylib/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── frequency_penalty.py
│       │   │   │   ├── min_new_tokens.py
│       │   │   │   ├── orchestrator.py
│       │   │   │   └── presence_penalty.py
│       │   │   ├── sampling_batch_info.py
│       │   │   └── sampling_params.py
│       │   ├── server_args.py
│       │   ├── server_args_config_parser.py
│       │   ├── speculative/
│       │   │   ├── base_spec_worker.py
│       │   │   ├── cpp_ngram/
│       │   │   │   ├── .clang-format
│       │   │   │   ├── ngram.cpp
│       │   │   │   ├── ngram.h
│       │   │   │   ├── ngram_cache.py
│       │   │   │   ├── ngram_cache_binding.cpp
│       │   │   │   ├── param.h
│       │   │   │   └── queue.h
│       │   │   ├── draft_utils.py
│       │   │   ├── eagle_draft_cuda_graph_runner.py
│       │   │   ├── eagle_draft_extend_cuda_graph_runner.py
│       │   │   ├── eagle_info.py
│       │   │   ├── eagle_info_v2.py
│       │   │   ├── eagle_utils.py
│       │   │   ├── eagle_worker.py
│       │   │   ├── eagle_worker_v2.py
│       │   │   ├── multi_layer_eagle_draft_extend_cuda_graph_runner.py
│       │   │   ├── multi_layer_eagle_utils.py
│       │   │   ├── multi_layer_eagle_worker.py
│       │   │   ├── multi_layer_eagle_worker_v2.py
│       │   │   ├── ngram_info.py
│       │   │   ├── ngram_worker.py
│       │   │   ├── spec_info.py
│       │   │   ├── spec_utils.py
│       │   │   ├── standalone_worker.py
│       │   │   └── standalone_worker_v2.py
│       │   ├── tokenizer/
│       │   │   └── tiktoken_tokenizer.py
│       │   ├── utils/
│       │   │   ├── __init__.py
│       │   │   ├── aio_rwlock.py
│       │   │   ├── auth.py
│       │   │   ├── bench_utils.py
│       │   │   ├── common.py
│       │   │   ├── cuda_ipc_transport_utils.py
│       │   │   ├── custom_op.py
│       │   │   ├── device_timer.py
│       │   │   ├── gauge_histogram.py
│       │   │   ├── hf_transformers_utils.py
│       │   │   ├── host_shared_memory.py
│       │   │   ├── json_response.py
│       │   │   ├── log_utils.py
│       │   │   ├── mistral_utils.py
│       │   │   ├── model_file_verifier.py
│       │   │   ├── multi_stream_utils.py
│       │   │   ├── network.py
│       │   │   ├── numa_utils.py
│       │   │   ├── nvtx_pytorch_hooks.py
│       │   │   ├── offloader.py
│       │   │   ├── patch_tokenizer.py
│       │   │   ├── patch_torch.py
│       │   │   ├── poll_based_barrier.py
│       │   │   ├── profile_merger.py
│       │   │   ├── profile_utils.py
│       │   │   ├── request_logger.py
│       │   │   ├── rpd_utils.py
│       │   │   ├── scheduler_status_logger.py
│       │   │   ├── slow_rank_detector.py
│       │   │   ├── torch_memory_saver_adapter.py
│       │   │   ├── video_decoder.py
│       │   │   ├── watchdog.py
│       │   │   └── weight_checker.py
│       │   └── weight_sync/
│       │       ├── tensor_bucket.py
│       │       └── utils.py
│       ├── test/
│       │   ├── __init__.py
│       │   ├── accuracy_test_runner.py
│       │   ├── ascend/
│       │   │   ├── __init__.py
│       │   │   ├── disaggregation_utils.py
│       │   │   ├── gsm8k_ascend_mixin.py
│       │   │   ├── test_ascend_utils.py
│       │   │   └── vlm_utils.py
│       │   ├── attention/
│       │   │   ├── __init__.py
│       │   │   ├── test_flashattn_backend.py
│       │   │   ├── test_flashattn_mla_backend.py
│       │   │   ├── test_prefix_chunk_info.py
│       │   │   └── test_trtllm_mla_backend.py
│       │   ├── bench_one_batch_server_internal.py
│       │   ├── ci/
│       │   │   ├── __init__.py
│       │   │   ├── ci_register.py
│       │   │   ├── ci_stress_utils.py
│       │   │   ├── ci_utils.py
│       │   │   └── run_with_retry.py
│       │   ├── doc_patch.py
│       │   ├── external_models/
│       │   │   └── custom_qwen2_vl.py
│       │   ├── few_shot_gsm8k.py
│       │   ├── few_shot_gsm8k_engine.py
│       │   ├── get_logits_ut.py
│       │   ├── gpt_oss_common.py
│       │   ├── kits/
│       │   │   ├── abort_timeout_kit.py
│       │   │   ├── cache_hit_kit.py
│       │   │   ├── ebnf_constrained_kit.py
│       │   │   ├── gsm8k_accuracy_kit.py
│       │   │   ├── json_constrained_kit.py
│       │   │   ├── kl_divergence_kit.py
│       │   │   ├── lm_eval_kit.py
│       │   │   ├── matched_stop_kit.py
│       │   │   ├── mmmu_vlm_kit.py
│       │   │   ├── prefix_cache_branching_kit.py
│       │   │   ├── radix_cache_server_kit.py
│       │   │   ├── regex_constrained_kit.py
│       │   │   └── spec_decoding_kit.py
│       │   ├── kl_test_utils.py
│       │   ├── long_prompt.txt
│       │   ├── longbench_v2/
│       │   │   ├── __init__.py
│       │   │   ├── longbench_v2_evaluation.md
│       │   │   ├── test_longbench_v2_eval.py
│       │   │   ├── validate_longbench_v2.py
│       │   │   └── validate_longbench_v2_standalone.py
│       │   ├── lora_utils.py
│       │   ├── nightly_bench_utils.py
│       │   ├── nightly_utils.py
│       │   ├── performance_test_runner.py
│       │   ├── run_combined_tests.py
│       │   ├── run_eval.py
│       │   ├── runners.py
│       │   ├── send_one.py
│       │   ├── server_fixtures/
│       │   │   ├── default_fixture.py
│       │   │   ├── disaggregation_fixture.py
│       │   │   ├── eagle_fixture.py
│       │   │   └── mmmu_fixture.py
│       │   ├── simple_eval_aime25.py
│       │   ├── simple_eval_common.py
│       │   ├── simple_eval_gpqa.py
│       │   ├── simple_eval_gsm8k.py
│       │   ├── simple_eval_humaneval.py
│       │   ├── simple_eval_longbench_v2.py
│       │   ├── simple_eval_math.py
│       │   ├── simple_eval_mgsm.py
│       │   ├── simple_eval_mmlu.py
│       │   ├── simple_eval_mmmu_vlm.py
│       │   ├── speculative/
│       │   │   └── test_spec_utils.py
│       │   ├── test_activation.py
│       │   ├── test_block_fp8.py
│       │   ├── test_block_fp8_deep_gemm_blackwell.py
│       │   ├── test_custom_ops.py
│       │   ├── test_cutlass_moe.py
│       │   ├── test_cutlass_w16a16_moe.py
│       │   ├── test_cutlass_w4a8_moe.py
│       │   ├── test_deepep_utils.py
│       │   ├── test_deterministic.py
│       │   ├── test_deterministic_utils.py
│       │   ├── test_dump_metric.py
│       │   ├── test_dynamic_grad_mode.py
│       │   ├── test_flashinfer_dispatcher.py
│       │   ├── test_http_server_auth.py
│       │   ├── test_kvfp4_quant_dequant.py
│       │   ├── test_layernorm.py
│       │   ├── test_marlin_utils.py
│       │   ├── test_programs.py
│       │   ├── test_utils.py
│       │   ├── tool_call_test_runner.py
│       │   └── vlm_utils.py
│       ├── utils.py
│       └── version.py
├── scripts/
│   ├── check_vram_clear.sh
│   ├── ci/
│   │   ├── amd/
│   │   │   ├── amd_ci_exec.sh
│   │   │   ├── amd_ci_install_dependency.sh
│   │   │   ├── amd_ci_start_container.sh
│   │   │   ├── amd_ci_start_container_disagg.sh
│   │   │   ├── amd_ci_warmup_aiter.py
│   │   │   └── test_rccl_multi_gpu.py
│   │   ├── cuda/
│   │   │   ├── ci_download_flashinfer_cubin.sh
│   │   │   ├── ci_install_deepep.sh
│   │   │   ├── ci_install_dependency.sh
│   │   │   ├── ci_install_gateway_dependencies.sh
│   │   │   ├── ci_start_disaggregation_servers.sh
│   │   │   ├── prepare_runner.sh
│   │   │   ├── warmup_deep_gemm.py
│   │   │   └── warmup_server.py
│   │   ├── musa/
│   │   │   ├── musa_install_dependency.sh
│   │   │   └── rename_wheels_musa.sh
│   │   ├── npu/
│   │   │   ├── npu_ci_install_dependency.sh
│   │   │   └── npu_log_print.sh
│   │   └── utils/
│   │       ├── ci_coverage_report.py
│   │       ├── cleanup_hf_cache.py
│   │       ├── merge_metrics.py
│   │       ├── prevalidate_cached_models.py
│   │       ├── publish_diffusion_gt.py
│   │       ├── publish_traces.py
│   │       ├── query_job_status.py
│   │       ├── runner_utilization_report.py
│   │       ├── save_diffusion_metrics.py
│   │       ├── save_metrics.py
│   │       └── slash_command_handler.py
│   ├── ci_monitor/
│   │   ├── README.md
│   │   ├── ci_failures_analysis.py
│   │   └── post_ci_failures_to_slack.py
│   ├── code_sync/
│   │   ├── check_commits.py
│   │   ├── copy_from_oss.py
│   │   ├── copy_to_oss.py
│   │   ├── guideline.md
│   │   ├── install_github_cli.sh
│   │   └── utils.py
│   ├── convert_otel_2_perfetto.py
│   ├── ensure_vram_clear.sh
│   ├── export_deepseek_nextn.py
│   ├── killall_sglang.sh
│   ├── playground/
│   │   ├── bench_speculative.py
│   │   ├── disaggregation/
│   │   │   ├── cli-logprob.py
│   │   │   ├── cli-so.py
│   │   │   └── cli.py
│   │   ├── frontend_reasoning.ipynb
│   │   ├── load_tokenizer.py
│   │   ├── long_context_example.py
│   │   ├── lora/
│   │   │   ├── analyzer.py
│   │   │   ├── lora_hf_play.py
│   │   │   └── lora_vllm_play.py
│   │   ├── reference_hf.py
│   │   ├── replay_request_dump.py
│   │   └── router/
│   │       ├── test_tree.py
│   │       └── tree.py
│   ├── release/
│   │   ├── README.md
│   │   ├── bump_flashinfer_version.py
│   │   ├── bump_kernel_version.py
│   │   ├── bump_kernel_version_to_sglang.py
│   │   ├── bump_sglang_version.py
│   │   ├── check_kernel_version_to_sglang.py
│   │   ├── commit_and_pr.sh
│   │   ├── commit_and_pr_kernel_to_sglang.sh
│   │   ├── test_utils.py
│   │   └── utils.py
│   ├── sort_testcases_alphabetically.py
│   ├── update_kernel_whl_index.py
│   ├── update_nightly_whl_index.py
│   ├── update_pr_whl_index.py
│   └── version_branch_to_tag.sh
├── sgl-kernel/
│   ├── .clang-format
│   ├── CMakeLists.txt
│   ├── Dockerfile
│   ├── LICENSE
│   ├── Makefile
│   ├── README.md
│   ├── THIRDPARTYNOTICES.txt
│   ├── analyze_whl_kernel_sizes.py
│   ├── benchmark/
│   │   ├── bench_activation.py
│   │   ├── bench_amd_deterministic_allreduce.py
│   │   ├── bench_awq_dequant.py
│   │   ├── bench_cutlass_mla.py
│   │   ├── bench_dsv3_fused_a_gemm.py
│   │   ├── bench_dsv3_router_gemm.py
│   │   ├── bench_es_fp8_blockwise_grouped_gemm.py
│   │   ├── bench_fp4_gemm.py
│   │   ├── bench_fp8_blockwise_gemm.py
│   │   ├── bench_fp8_blockwise_group_gemm.py
│   │   ├── bench_fp8_gemm.py
│   │   ├── bench_int8_gemm.py
│   │   ├── bench_kimi_k2_moe_fused_gate.py
│   │   ├── bench_moe_align_block_size.py
│   │   ├── bench_moe_ep_post_reorder.py
│   │   ├── bench_moe_fused_gate.py
│   │   ├── bench_moe_topk_sigmoid.py
│   │   ├── bench_moe_topk_softmax.py
│   │   ├── bench_mrope.py
│   │   ├── bench_per_tensor_quant_fp8.py
│   │   ├── bench_per_token_group_quant_8bit.py
│   │   ├── bench_per_token_quant_fp8.py
│   │   ├── bench_qserve_w4a8_gemm.py
│   │   ├── bench_rmsnorm.py
│   │   ├── bench_rotary_embedding.py
│   │   ├── bench_sum_scale.py
│   │   └── bench_top_k_top_p_sampling.py
│   ├── build.sh
│   ├── cmake/
│   │   ├── flashmla.cmake
│   │   └── utils.cmake
│   ├── csrc/
│   │   ├── allreduce/
│   │   │   ├── custom_all_reduce.cu
│   │   │   ├── custom_all_reduce.cuh
│   │   │   ├── custom_all_reduce.hip
│   │   │   ├── custom_all_reduce_hip.cuh
│   │   │   ├── deterministic_all_reduce.hip
│   │   │   ├── mscclpp_allreduce.cu
│   │   │   ├── mscclpp_allreduce.cuh
│   │   │   ├── quick_all_reduce.cu
│   │   │   ├── quick_all_reduce.cuh
│   │   │   ├── quick_all_reduce.h
│   │   │   ├── quick_all_reduce_base.h
│   │   │   └── test_mscclpp_allreduce.cu
│   │   ├── attention/
│   │   │   ├── cascade.cu
│   │   │   ├── cutlass_mla_kernel.cu
│   │   │   ├── cutlass_sm100_mla/
│   │   │   │   ├── device/
│   │   │   │   │   └── sm100_mla.hpp
│   │   │   │   └── kernel/
│   │   │   │       ├── sm100_fmha_mla_reduction.hpp
│   │   │   │       ├── sm100_fmha_mla_tma_warpspecialized.hpp
│   │   │   │       └── sm100_mla_tile_scheduler.hpp
│   │   │   ├── merge_attn_states.cu
│   │   │   └── vertical_slash_index.cu
│   │   ├── common_extension.cc
│   │   ├── common_extension_musa.cc
│   │   ├── common_extension_rocm.cc
│   │   ├── cpu/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── aarch64/
│   │   │   │   └── shm.h
│   │   │   ├── activation.cpp
│   │   │   ├── bmm.cpp
│   │   │   ├── common.h
│   │   │   ├── conv3d.cpp
│   │   │   ├── decode.cpp
│   │   │   ├── extend.cpp
│   │   │   ├── flash_attn.cpp
│   │   │   ├── flash_attn.h
│   │   │   ├── gemm.cpp
│   │   │   ├── gemm.h
│   │   │   ├── gemm_fp8.cpp
│   │   │   ├── gemm_int4.cpp
│   │   │   ├── gemm_int8.cpp
│   │   │   ├── interface.cpp
│   │   │   ├── mamba/
│   │   │   │   ├── conv.cpp
│   │   │   │   └── fla.cpp
│   │   │   ├── model/
│   │   │   │   └── qwen3.cpp
│   │   │   ├── moe.cpp
│   │   │   ├── moe_fp8.cpp
│   │   │   ├── moe_int4.cpp
│   │   │   ├── moe_int8.cpp
│   │   │   ├── norm.cpp
│   │   │   ├── numa_utils.cpp
│   │   │   ├── preprocessor.cpp
│   │   │   ├── qkv_proj.cpp
│   │   │   ├── rope.cpp
│   │   │   ├── shm.cpp
│   │   │   ├── shm.h
│   │   │   ├── topk.cpp
│   │   │   ├── torch_extension_cpu.cpp
│   │   │   ├── vec.h
│   │   │   ├── vec_pack.h
│   │   │   └── x86_64/
│   │   │       └── shm.h
│   │   ├── cutlass_extensions/
│   │   │   ├── common.hpp
│   │   │   ├── detail/
│   │   │   │   └── collective/
│   │   │   │       └── mixed_input_utils.hpp
│   │   │   ├── epilogue/
│   │   │   │   └── epilogue_per_row_per_col_scale.h
│   │   │   └── gemm/
│   │   │       ├── collective/
│   │   │       │   ├── builders/
│   │   │       │   │   └── sm90_gmma_builder_mixed_input.inl
│   │   │       │   ├── collective_builder_mixed_input.hpp
│   │   │       │   ├── collective_mma_array_mixed_input.hpp
│   │   │       │   └── sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
│   │   │       ├── cutlass_gemm_caller.cuh
│   │   │       ├── dispatch_policy.hpp
│   │   │       ├── fp8_blockwise_gemm_sm90_dispatch.cuh
│   │   │       ├── gemm_universal_base_compat.h
│   │   │       └── gemm_with_epilogue_visitor.h
│   │   ├── elementwise/
│   │   │   ├── activation.cu
│   │   │   ├── cast.cu
│   │   │   ├── concat_mla.cu
│   │   │   ├── copy.cu
│   │   │   ├── fused_add_rms_norm_kernel.cu
│   │   │   ├── pos_enc.cu
│   │   │   ├── pos_enc.cuh
│   │   │   ├── topk.cu
│   │   │   └── utils.cuh
│   │   ├── expert_specialization/
│   │   │   ├── es_fp8_blockwise.cu
│   │   │   ├── es_fp8_blockwise_functor.cuh
│   │   │   ├── es_fp8_blockwise_launcher.cuh
│   │   │   ├── es_fp8_blockwise_traits.cuh
│   │   │   ├── es_sm100_mxfp8_blockscaled.cu
│   │   │   ├── es_sm100_mxfp8_blockscaled_functor.cuh
│   │   │   ├── es_sm100_mxfp8_blockscaled_group_quant.cu
│   │   │   ├── es_sm100_mxfp8_blockscaled_group_quant.cuh
│   │   │   ├── es_sm100_mxfp8_blockscaled_launcher.cuh
│   │   │   └── es_sm100_mxfp8_blockscaled_traits.cuh
│   │   ├── flash_extension.cc
│   │   ├── flashmla_extension.cc
│   │   ├── gemm/
│   │   │   ├── awq_kernel.cu
│   │   │   ├── bmm_fp8.cu
│   │   │   ├── dsv3_fused_a_gemm.cu
│   │   │   ├── dsv3_router_gemm_bf16_out.cu
│   │   │   ├── dsv3_router_gemm_entry.cu
│   │   │   ├── dsv3_router_gemm_float_out.cu
│   │   │   ├── fp8_blockwise_gemm_kernel.cu
│   │   │   ├── fp8_gemm_kernel.cu
│   │   │   ├── gptq/
│   │   │   │   ├── compat.cuh
│   │   │   │   ├── gptq_kernel.cu
│   │   │   │   ├── matrix_view.cuh
│   │   │   │   ├── qdq_2.cuh
│   │   │   │   ├── qdq_3.cuh
│   │   │   │   ├── qdq_4.cuh
│   │   │   │   ├── qdq_8.cuh
│   │   │   │   └── qdq_util.cuh
│   │   │   ├── int8_gemm_kernel.cu
│   │   │   ├── marlin/
│   │   │   │   ├── dequant.h
│   │   │   │   ├── kernel.h
│   │   │   │   ├── marlin.cuh
│   │   │   │   ├── marlin_dtypes.cuh
│   │   │   │   └── marlin_template.h
│   │   │   ├── math.hpp
│   │   │   ├── per_token_group_quant_8bit.cu
│   │   │   ├── per_token_group_quant_8bit_v2.cu
│   │   │   ├── per_token_quant_fp8.cu
│   │   │   ├── qserve_w4a8_per_chn_gemm.cu
│   │   │   └── qserve_w4a8_per_group_gemm.cu
│   │   ├── grammar/
│   │   │   └── apply_token_bitmask_inplace_cuda.cu
│   │   ├── kvcacheio/
│   │   │   └── transfer.cu
│   │   ├── mamba/
│   │   │   ├── causal_conv1d.cu
│   │   │   └── causal_conv1d.h
│   │   ├── memory/
│   │   │   └── weak_ref_tensor.cpp
│   │   ├── moe/
│   │   │   ├── cutlass_moe/
│   │   │   │   └── w4a8/
│   │   │   │       ├── scaled_mm_entry.cu
│   │   │   │       ├── w4a8_get_group_starts.cuh
│   │   │   │       ├── w4a8_grouped_mm_c3x.cu
│   │   │   │       ├── w4a8_grouped_mm_c3x.cuh
│   │   │   │       └── w4a8_moe_data.cu
│   │   │   ├── cutlass_moe_helper.cu
│   │   │   ├── fp8_blockwise_moe_kernel.cu
│   │   │   ├── fused_qknorm_rope_kernel.cu
│   │   │   ├── kimi_k2_moe_fused_gate.cu
│   │   │   ├── moe_align_kernel.cu
│   │   │   ├── moe_fused_gate.cu
│   │   │   ├── moe_sum.cu
│   │   │   ├── moe_sum_reduce.cu
│   │   │   ├── moe_topk_sigmoid_kernels.cu
│   │   │   ├── moe_topk_softmax_kernels.cu
│   │   │   └── prepare_moe_input.cu
│   │   ├── quantization/
│   │   │   └── gguf/
│   │   │       ├── dequantize.cuh
│   │   │       ├── ggml-common.h
│   │   │       ├── gguf_kernel.cu
│   │   │       ├── mmq.cuh
│   │   │       ├── mmvq.cuh
│   │   │       ├── moe.cuh
│   │   │       ├── moe_vec.cuh
│   │   │       └── vecdotq.cuh
│   │   ├── spatial/
│   │   │   ├── cuda_utils.h
│   │   │   ├── greenctx_stream.cu
│   │   │   └── greenctx_stream.h
│   │   ├── spatial_extension.cc
│   │   └── speculative/
│   │       ├── eagle_utils.cu
│   │       ├── ngram_utils.cu
│   │       ├── packbit.cu
│   │       ├── speculative_sampling.cu
│   │       └── speculative_sampling.cuh
│   ├── include/
│   │   ├── hip/
│   │   │   ├── hip_act_and_mul.cuh
│   │   │   ├── hip_math_def.h
│   │   │   ├── hip_vec_dtypes.h
│   │   │   └── impl/
│   │   │       ├── hip_vec_bf16_impl.h
│   │   │       ├── hip_vec_fp32_impl.h
│   │   │       └── hip_vec_half_impl.h
│   │   ├── pytorch_extension_utils_rocm.h
│   │   ├── scalar_type.hpp
│   │   ├── sgl_flash_kernel_ops.h
│   │   ├── sgl_kernel_ops.h
│   │   ├── sgl_kernel_torch_shim.h
│   │   └── utils.h
│   ├── kernel-runner-setup.sh
│   ├── pyproject.toml
│   ├── pyproject_cpu.toml
│   ├── pyproject_musa.toml
│   ├── pyproject_rocm.toml
│   ├── python/
│   │   └── sgl_kernel/
│   │       ├── __init__.py
│   │       ├── _fa4_interface.py
│   │       ├── allreduce.py
│   │       ├── attention.py
│   │       ├── cutlass_moe.py
│   │       ├── elementwise.py
│   │       ├── expert_specialization.py
│   │       ├── flash_attn.py
│   │       ├── flash_mla.py
│   │       ├── gemm.py
│   │       ├── grammar.py
│   │       ├── kvcacheio.py
│   │       ├── load_utils.py
│   │       ├── mamba.py
│   │       ├── memory.py
│   │       ├── moe.py
│   │       ├── quantization/
│   │       │   ├── __init__.py
│   │       │   └── gguf.py
│   │       ├── sampling.py
│   │       ├── scalar_type.py
│   │       ├── sparse_flash_attn.py
│   │       ├── spatial.py
│   │       ├── speculative.py
│   │       ├── test_utils.py
│   │       ├── testing/
│   │       │   ├── __init__.py
│   │       │   └── rotary_embedding.py
│   │       ├── top_k.py
│   │       ├── utils.py
│   │       └── version.py
│   ├── rename_wheels.sh
│   ├── setup_musa.py
│   ├── setup_rocm.py
│   └── tests/
│       ├── conftest.py
│       ├── spatial/
│       │   └── test_greenctx_stream.py
│       ├── speculative/
│       │   ├── test_eagle_utils.py
│       │   ├── test_ngram_utils.py
│       │   └── test_speculative_sampling.py
│       ├── test_activation.py
│       ├── test_amd_deterministic_custom_allreduce.py
│       ├── test_amd_nccl_allreduce_determinism.py
│       ├── test_apply_token_bitmask_inplace.py
│       ├── test_awq_dequant.py
│       ├── test_bmm_fp8.py
│       ├── test_causal_conv1d.py
│       ├── test_copy.py
│       ├── test_custom_allreduce.py
│       ├── test_cutlass_mla.py
│       ├── test_cutlass_w4a8_moe_mm.py
│       ├── test_dsv3_fused_a_gemm.py
│       ├── test_dsv3_router_gemm.py
│       ├── test_es_fp8_blockwise_moe.py
│       ├── test_es_mxfp8_blockscaled_moe.py
│       ├── test_flash_attention.py
│       ├── test_flash_attn_sparse.py
│       ├── test_flashmla.py
│       ├── test_fp8_blockwise_gemm.py
│       ├── test_fp8_blockwise_moe.py
│       ├── test_fp8_gemm.py
│       ├── test_fused_qk_norm_rope.py
│       ├── test_gguf.py
│       ├── test_gptq_kernel.py
│       ├── test_hadamard.py
│       ├── test_int8_gemm.py
│       ├── test_kimi_k2_moe_fused_gate.py
│       ├── test_kvcacheio.py
│       ├── test_merge_state.py
│       ├── test_merge_state_v2.py
│       ├── test_moe_align.py
│       ├── test_moe_fused_gate.py
│       ├── test_moe_topk_sigmoid.py
│       ├── test_moe_topk_softmax.py
│       ├── test_mscclpp.py
│       ├── test_norm.py
│       ├── test_per_token_group_quant_8bit.py
│       ├── test_per_token_quant_fp8.py
│       ├── test_qserve_w4a8_per_chn_gemm.py
│       ├── test_qserve_w4a8_per_group_gemm.py
│       ├── test_sampling.py
│       ├── test_topk.py
│       ├── test_torch_defaults_reset.py
│       └── utils.py
├── sgl-model-gateway/
│   ├── .cargo/
│   │   └── config.toml
│   ├── Cargo.toml
│   ├── Makefile
│   ├── README.md
│   ├── benches/
│   │   ├── consistent_hash_bench.rs
│   │   ├── manual_policy_benchmark.rs
│   │   ├── request_processing.rs
│   │   ├── router_registry_bench.rs
│   │   ├── tree_benchmark.rs
│   │   └── wasm_middleware_latency.rs
│   ├── bindings/
│   │   ├── golang/
│   │   │   ├── .gitignore
│   │   │   ├── Cargo.toml
│   │   │   ├── Makefile
│   │   │   ├── README.md
│   │   │   ├── client.go
│   │   │   ├── client_test.go
│   │   │   ├── examples/
│   │   │   │   ├── oai_server/
│   │   │   │   │   ├── Makefile
│   │   │   │   │   ├── README.md
│   │   │   │   │   ├── config/
│   │   │   │   │   │   └── config.go
│   │   │   │   │   ├── docs/
│   │   │   │   │   │   └── benchmark_result.md
│   │   │   │   │   ├── go.sum
│   │   │   │   │   ├── handlers/
│   │   │   │   │   │   ├── chat.go
│   │   │   │   │   │   ├── health.go
│   │   │   │   │   │   └── models.go
│   │   │   │   │   ├── logger/
│   │   │   │   │   │   └── logger.go
│   │   │   │   │   ├── main.go
│   │   │   │   │   ├── models/
│   │   │   │   │   │   └── chat.go
│   │   │   │   │   ├── run.sh
│   │   │   │   │   ├── scripts/
│   │   │   │   │   │   ├── analyze_tpot.sh
│   │   │   │   │   │   ├── pprof_analysis.sh
│   │   │   │   │   │   ├── pprof_quick.sh
│   │   │   │   │   │   ├── pprof_test.sh
│   │   │   │   │   │   └── profile_tpot.sh
│   │   │   │   │   ├── service/
│   │   │   │   │   │   └── sglang.go
│   │   │   │   │   └── utils/
│   │   │   │   │       └── utils.go
│   │   │   │   ├── simple/
│   │   │   │   │   ├── main.go
│   │   │   │   │   └── run.sh
│   │   │   │   └── streaming/
│   │   │   │       ├── main.go
│   │   │   │       └── run.sh
│   │   │   ├── go.sum
│   │   │   ├── integration_test.go
│   │   │   ├── internal/
│   │   │   │   ├── ffi/
│   │   │   │   │   ├── batch_postprocessor.go
│   │   │   │   │   ├── client.go
│   │   │   │   │   ├── grpc_converter.go
│   │   │   │   │   ├── postprocessor.go
│   │   │   │   │   └── preprocessor.go
│   │   │   │   ├── grpc/
│   │   │   │   │   └── client_grpc.go
│   │   │   │   └── proto/
│   │   │   │       ├── sglang_scheduler.pb.go
│   │   │   │       └── sglang_scheduler_grpc.pb.go
│   │   │   └── src/
│   │   │       ├── client.rs
│   │   │       ├── error.rs
│   │   │       ├── grpc_converter.rs
│   │   │       ├── lib.rs
│   │   │       ├── memory.rs
│   │   │       ├── postprocessor.rs
│   │   │       ├── preprocessor.rs
│   │   │       ├── stream.rs
│   │   │       ├── tokenizer.rs
│   │   │       ├── tool_parser.rs
│   │   │       └── utils.rs
│   │   └── python/
│   │       ├── .coveragerc
│   │       ├── Cargo.toml
│   │       ├── MANIFEST.in
│   │       ├── README.md
│   │       ├── pyproject.toml
│   │       ├── setup.py
│   │       ├── src/
│   │       │   ├── lib.rs
│   │       │   └── sglang_router/
│   │       │       ├── __init__.py
│   │       │       ├── __main__.py
│   │       │       ├── cli.py
│   │       │       ├── launch_router.py
│   │       │       ├── launch_server.py
│   │       │       ├── mini_lb.py
│   │       │       ├── router.py
│   │       │       ├── router_args.py
│   │       │       └── version.py
│   │       └── tests/
│   │           ├── conftest.py
│   │           ├── test_arg_parser.py
│   │           ├── test_router_config.py
│   │           ├── test_startup_sequence.py
│   │           └── test_validation.py
│   ├── build.rs
│   ├── e2e_test/
│   │   ├── __init__.py
│   │   ├── benchmarks/
│   │   │   ├── __init__.py
│   │   │   ├── conftest.py
│   │   │   ├── results.py
│   │   │   ├── summarize.py
│   │   │   ├── test_pd_perf.py
│   │   │   └── test_regular_perf.py
│   │   ├── chat_completions/
│   │   │   ├── __init__.py
│   │   │   ├── test_enable_thinking.py
│   │   │   ├── test_function_calling.py
│   │   │   ├── test_openai_server.py
│   │   │   ├── test_reasoning_content.py
│   │   │   └── test_validation.py
│   │   ├── conftest.py
│   │   ├── embeddings/
│   │   │   ├── __init__.py
│   │   │   ├── test_basic.py
│   │   │   └── test_correctness.py
│   │   ├── fixtures/
│   │   │   ├── __init__.py
│   │   │   ├── hooks.py
│   │   │   ├── markers.py
│   │   │   ├── pool.py
│   │   │   ├── ports.py
│   │   │   └── setup_backend.py
│   │   ├── infra/
│   │   │   ├── __init__.py
│   │   │   ├── constants.py
│   │   │   ├── gateway.py
│   │   │   ├── gpu_allocator.py
│   │   │   ├── gpu_monitor.py
│   │   │   ├── model_pool.py
│   │   │   ├── model_specs.py
│   │   │   ├── process_utils.py
│   │   │   ├── run_eval.py
│   │   │   ├── simple_eval_common.py
│   │   │   └── simple_eval_mmlu.py
│   │   ├── pyproject.toml
│   │   ├── responses/
│   │   │   ├── __init__.py
│   │   │   ├── test_basic_crud.py
│   │   │   ├── test_state_management.py
│   │   │   ├── test_streaming_events.py
│   │   │   ├── test_structured_output.py
│   │   │   └── test_tools_call.py
│   │   └── router/
│   │       ├── __init__.py
│   │       ├── test_mmlu.py
│   │       ├── test_pd_mmlu.py
│   │       └── test_worker_api.py
│   ├── examples/
│   │   └── wasm/
│   │       ├── .gitignore
│   │       ├── README.md
│   │       ├── wasm-guest-auth/
│   │       │   ├── Cargo.toml
│   │       │   ├── README.md
│   │       │   ├── build.sh
│   │       │   └── src/
│   │       │       └── lib.rs
│   │       ├── wasm-guest-logging/
│   │       │   ├── Cargo.toml
│   │       │   ├── README.md
│   │       │   ├── build.sh
│   │       │   └── src/
│   │       │       └── lib.rs
│   │       └── wasm-guest-ratelimit/
│   │           ├── Cargo.toml
│   │           ├── README.md
│   │           ├── build.sh
│   │           └── src/
│   │               └── lib.rs
│   ├── pytest.ini
│   ├── rustfmt.toml
│   ├── scripts/
│   │   ├── generate_gateway_release_notes.sh
│   │   ├── generate_vision_golden.py
│   │   ├── run_benchmarks.py
│   │   └── setup-sccache.sh
│   ├── src/
│   │   ├── app_context.rs
│   │   ├── config/
│   │   │   ├── builder.rs
│   │   │   ├── mod.rs
│   │   │   ├── types.rs
│   │   │   └── validation.rs
│   │   ├── core/
│   │   │   ├── circuit_breaker.rs
│   │   │   ├── error.rs
│   │   │   ├── job_queue.rs
│   │   │   ├── metrics_aggregator.rs
│   │   │   ├── mod.rs
│   │   │   ├── model_card.rs
│   │   │   ├── model_type.rs
│   │   │   ├── retry.rs
│   │   │   ├── steps/
│   │   │   │   ├── mcp_registration.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── tokenizer_registration.rs
│   │   │   │   ├── wasm_module_registration.rs
│   │   │   │   ├── wasm_module_removal.rs
│   │   │   │   ├── worker/
│   │   │   │   │   ├── external/
│   │   │   │   │   │   ├── create_workers.rs
│   │   │   │   │   │   ├── discover_models.rs
│   │   │   │   │   │   └── mod.rs
│   │   │   │   │   ├── local/
│   │   │   │   │   │   ├── create_worker.rs
│   │   │   │   │   │   ├── detect_connection.rs
│   │   │   │   │   │   ├── discover_dp.rs
│   │   │   │   │   │   ├── discover_metadata.rs
│   │   │   │   │   │   ├── find_worker_to_update.rs
│   │   │   │   │   │   ├── find_workers_to_remove.rs
│   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   ├── remove_from_policy_registry.rs
│   │   │   │   │   │   ├── remove_from_worker_registry.rs
│   │   │   │   │   │   ├── submit_tokenizer_job.rs
│   │   │   │   │   │   ├── update_policies_for_worker.rs
│   │   │   │   │   │   ├── update_remaining_policies.rs
│   │   │   │   │   │   └── update_worker_properties.rs
│   │   │   │   │   ├── mod.rs
│   │   │   │   │   └── shared/
│   │   │   │   │       ├── activate.rs
│   │   │   │   │       ├── mod.rs
│   │   │   │   │       ├── register.rs
│   │   │   │   │       └── update_policies.rs
│   │   │   │   ├── workflow_data.rs
│   │   │   │   └── workflow_engines.rs
│   │   │   ├── token_bucket.rs
│   │   │   ├── worker.rs
│   │   │   ├── worker_builder.rs
│   │   │   ├── worker_manager.rs
│   │   │   ├── worker_registry.rs
│   │   │   └── worker_service.rs
│   │   ├── lib.rs
│   │   ├── main.rs
│   │   ├── middleware.rs
│   │   ├── observability/
│   │   │   ├── events.rs
│   │   │   ├── gauge_histogram.rs
│   │   │   ├── inflight_tracker.rs
│   │   │   ├── logging.rs
│   │   │   ├── metrics.rs
│   │   │   ├── mod.rs
│   │   │   └── otel_trace.rs
│   │   ├── policies/
│   │   │   ├── bucket.rs
│   │   │   ├── cache_aware.rs
│   │   │   ├── consistent_hashing.rs
│   │   │   ├── factory.rs
│   │   │   ├── manual.rs
│   │   │   ├── mod.rs
│   │   │   ├── power_of_two.rs
│   │   │   ├── prefix_hash.rs
│   │   │   ├── random.rs
│   │   │   ├── registry.rs
│   │   │   ├── round_robin.rs
│   │   │   ├── tree.rs
│   │   │   └── utils.rs
│   │   ├── routers/
│   │   │   ├── conversations/
│   │   │   │   ├── handlers.rs
│   │   │   │   └── mod.rs
│   │   │   ├── error.rs
│   │   │   ├── factory.rs
│   │   │   ├── grpc/
│   │   │   │   ├── client.rs
│   │   │   │   ├── common/
│   │   │   │   │   ├── mod.rs
│   │   │   │   │   ├── response_collection.rs
│   │   │   │   │   ├── response_formatting.rs
│   │   │   │   │   ├── responses/
│   │   │   │   │   │   ├── context.rs
│   │   │   │   │   │   ├── handlers.rs
│   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   ├── streaming.rs
│   │   │   │   │   │   └── utils.rs
│   │   │   │   │   └── stages/
│   │   │   │   │       ├── client_acquisition.rs
│   │   │   │   │       ├── dispatch_metadata.rs
│   │   │   │   │       ├── helpers.rs
│   │   │   │   │       ├── mod.rs
│   │   │   │   │       ├── request_execution.rs
│   │   │   │   │       └── worker_selection.rs
│   │   │   │   ├── context.rs
│   │   │   │   ├── harmony/
│   │   │   │   │   ├── builder.rs
│   │   │   │   │   ├── detector.rs
│   │   │   │   │   ├── mod.rs
│   │   │   │   │   ├── parser.rs
│   │   │   │   │   ├── processor.rs
│   │   │   │   │   ├── responses/
│   │   │   │   │   │   ├── common.rs
│   │   │   │   │   │   ├── execution.rs
│   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   ├── non_streaming.rs
│   │   │   │   │   │   └── streaming.rs
│   │   │   │   │   ├── stages/
│   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   ├── preparation.rs
│   │   │   │   │   │   ├── request_building.rs
│   │   │   │   │   │   └── response_processing.rs
│   │   │   │   │   ├── streaming.rs
│   │   │   │   │   └── types.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── pd_router.rs
│   │   │   │   ├── pipeline.rs
│   │   │   │   ├── proto_wrapper.rs
│   │   │   │   ├── regular/
│   │   │   │   │   ├── mod.rs
│   │   │   │   │   ├── processor.rs
│   │   │   │   │   ├── responses/
│   │   │   │   │   │   ├── common.rs
│   │   │   │   │   │   ├── conversions.rs
│   │   │   │   │   │   ├── handlers.rs
│   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   ├── non_streaming.rs
│   │   │   │   │   │   └── streaming.rs
│   │   │   │   │   ├── stages/
│   │   │   │   │   │   ├── chat/
│   │   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   │   ├── preparation.rs
│   │   │   │   │   │   │   ├── request_building.rs
│   │   │   │   │   │   │   └── response_processing.rs
│   │   │   │   │   │   ├── classify/
│   │   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   │   └── response_processing.rs
│   │   │   │   │   │   ├── embedding/
│   │   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   │   ├── preparation.rs
│   │   │   │   │   │   │   ├── request_building.rs
│   │   │   │   │   │   │   └── response_processing.rs
│   │   │   │   │   │   ├── generate/
│   │   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   │   ├── preparation.rs
│   │   │   │   │   │   │   ├── request_building.rs
│   │   │   │   │   │   │   └── response_processing.rs
│   │   │   │   │   │   ├── mod.rs
│   │   │   │   │   │   ├── preparation.rs
│   │   │   │   │   │   ├── request_building.rs
│   │   │   │   │   │   └── response_processing.rs
│   │   │   │   │   └── streaming.rs
│   │   │   │   ├── router.rs
│   │   │   │   └── utils.rs
│   │   │   ├── header_utils.rs
│   │   │   ├── http/
│   │   │   │   ├── mod.rs
│   │   │   │   ├── pd_router.rs
│   │   │   │   ├── pd_types.rs
│   │   │   │   └── router.rs
│   │   │   ├── mcp_utils.rs
│   │   │   ├── mesh/
│   │   │   │   ├── handlers.rs
│   │   │   │   └── mod.rs
│   │   │   ├── mod.rs
│   │   │   ├── openai/
│   │   │   │   ├── context.rs
│   │   │   │   ├── mod.rs
│   │   │   │   ├── provider.rs
│   │   │   │   ├── responses/
│   │   │   │   │   ├── accumulator.rs
│   │   │   │   │   ├── common.rs
│   │   │   │   │   ├── mcp.rs
│   │   │   │   │   ├── mod.rs
│   │   │   │   │   ├── non_streaming.rs
│   │   │   │   │   ├── streaming.rs
│   │   │   │   │   ├── tool_handler.rs
│   │   │   │   │   └── utils.rs
│   │   │   │   └── router.rs
│   │   │   ├── parse/
│   │   │   │   ├── handlers.rs
│   │   │   │   └── mod.rs
│   │   │   ├── persistence_utils.rs
│   │   │   ├── router_manager.rs
│   │   │   └── tokenize/
│   │   │       ├── handlers.rs
│   │   │       └── mod.rs
│   │   ├── server.rs
│   │   ├── service_discovery.rs
│   │   ├── version.rs
│   │   └── wasm/
│   │       ├── mod.rs
│   │       └── route.rs
│   └── tests/
│       ├── api/
│       │   ├── api_endpoints_test.rs
│       │   ├── mod.rs
│       │   ├── parser_endpoints_test.rs
│       │   ├── request_formats_test.rs
│       │   ├── responses_api_test.rs
│       │   └── streaming_tests.rs
│       ├── api_tests.rs
│       ├── common/
│       │   ├── mock_mcp_server.rs
│       │   ├── mock_openai_server.rs
│       │   ├── mock_worker.rs
│       │   ├── mod.rs
│       │   ├── redis_test_server.rs
│       │   ├── streaming_helpers.rs
│       │   ├── test_app.rs
│       │   ├── test_certs.rs
│       │   ├── test_config.rs
│       │   └── tls_mock_worker.rs
│       ├── inflight_tracker_test.rs
│       ├── load_guard_raii_test.rs
│       ├── mcp_test.rs
│       ├── metrics_aggregator_test.rs
│       ├── otel_tracing_test.rs
│       ├── reliability/
│       │   ├── circuit_breaker_test.rs
│       │   ├── fault_tolerance_test.rs
│       │   ├── mod.rs
│       │   ├── rate_limiting_test.rs
│       │   └── retries_test.rs
│       ├── reliability_tests.rs
│       ├── routing/
│       │   ├── cache_aware_backward_compat_test.rs
│       │   ├── header_forwarding_test.rs
│       │   ├── load_balancing_test.rs
│       │   ├── manual_routing_test.rs
│       │   ├── mod.rs
│       │   ├── payload_size_test.rs
│       │   ├── pd_routing_test.rs
│       │   ├── policy_registry_integration.rs
│       │   ├── power_of_two_test.rs
│       │   ├── service_discovery_test.rs
│       │   ├── test_openai_routing.rs
│       │   ├── test_pd_routing.rs
│       │   └── worker_management_test.rs
│       ├── routing_tests.rs
│       ├── security/
│       │   ├── auth_integration_test.rs
│       │   ├── auth_test.rs
│       │   ├── mod.rs
│       │   └── mtls_test.rs
│       ├── security_tests.rs
│       ├── spec/
│       │   ├── chat_completion.rs
│       │   ├── chat_message.rs
│       │   ├── embedding.rs
│       │   ├── mod.rs
│       │   ├── rerank.rs
│       │   └── responses.rs
│       ├── spec_test.rs
│       └── wasm_test.rs
└── test/
    ├── README.md
    ├── lm_eval_configs/
    │   ├── NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
    │   ├── NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
    │   └── Qwen3.5-397B-A17B.yaml
    ├── manual/
    │   ├── ascend/
    │   │   ├── test_ascend_deepseek_mtp.py
    │   │   ├── test_ascend_w8a8_quantization.py
    │   │   └── test_mindspore_models.py
    │   ├── cpu/
    │   │   └── test_comm.py
    │   ├── debug_utils/
    │   │   └── test_log_parser.py
    │   ├── entrypoints/
    │   │   └── http_server/
    │   │       └── test_abort_request.py
    │   ├── ep/
    │   │   ├── test_deepep_internode.py
    │   │   ├── test_deepep_intranode.py
    │   │   ├── test_deepep_low_latency.py
    │   │   ├── test_eplb.py
    │   │   ├── test_moe_deepep.py
    │   │   ├── test_moe_deepep_eval_accuracy_large.py
    │   │   ├── test_mooncake_expert_backup.py
    │   │   └── test_nixl_ep.py
    │   ├── hicache/
    │   │   ├── test_disaggregation_hicache.py
    │   │   └── test_pp_with_hicache.py
    │   ├── kv_transfer/
    │   │   └── test_mooncake_transfer_engine.py
    │   ├── lang_frontend/
    │   │   ├── test_bind_cache.py
    │   │   ├── test_choices.py
    │   │   ├── test_jump_forward.py
    │   │   ├── test_openai_backend.py
    │   │   ├── test_separate_reasoning.py
    │   │   └── test_separate_reasoning_execution.py
    │   ├── layers/
    │   │   ├── attention/
    │   │   │   └── nsa/
    │   │   │       ├── test_act_quant_triton.py
    │   │   │       ├── test_get_k_scale_triton_kernel.py
    │   │   │       └── test_index_buf_accessor.py
    │   │   └── moe/
    │   │       ├── test_moe_runners_1gpu.py
    │   │       └── test_moe_runners_4gpu.py
    │   ├── lora/
    │   │   ├── test_lora_cuda_graph.py
    │   │   ├── test_lora_llama4.py
    │   │   ├── test_lora_ops.py
    │   │   ├── test_lora_qwen3_vl.py
    │   │   ├── test_lora_spec_decoding.py
    │   │   └── test_torch_backend.py
    │   ├── models/
    │   │   ├── test_clip_models.py
    │   │   ├── test_falcon_h1_models.py
    │   │   ├── test_gme_qwen_models.py
    │   │   ├── test_grok_models.py
    │   │   ├── test_kimi_k2_models.py
    │   │   ├── test_llama4_models.py
    │   │   ├── test_mistral_large3_basic.py
    │   │   ├── test_mtp_models.py
    │   │   └── test_unsloth_models.py
    │   ├── nightly/
    │   │   ├── test_deepseek_v31_perf.py
    │   │   ├── test_deepseek_v32_perf.py
    │   │   ├── test_text_models_gsm8k_eval.py
    │   │   ├── test_text_models_perf.py
    │   │   ├── test_vlms_mmmu_eval.py
    │   │   ├── test_vlms_perf.py
    │   │   ├── test_vlms_piecewise_cuda_graph.py
    │   │   ├── test_vlms_vit_cuda_graph.py
    │   │   └── test_vlms_vit_flashinfer_cudnn.py
    │   ├── openai_server/
    │   │   └── features/
    │   │       ├── test_cache_report.py
    │   │       ├── test_continuous_usage_stats.py
    │   │       └── test_structural_tag.py
    │   ├── piecewise_cudagraph/
    │   │   └── test_disaggregation_piecewise_cuda_graph.py
    │   ├── quant/
    │   │   └── test_fp8_kvcache.py
    │   ├── test_async_dynamic_batch_tokenizer.py
    │   ├── test_async_mm_data_processor.py
    │   ├── test_config_integration.py
    │   ├── test_custom_allreduce.py
    │   ├── test_deepseek_chat_templates.py
    │   ├── test_double_sparsity.py
    │   ├── test_expert_distribution.py
    │   ├── test_expert_location_updater.py
    │   ├── test_fim_completion.py
    │   ├── test_forward_split_prefill.py
    │   ├── test_get_weights_by_name.py
    │   ├── test_health_check.py
    │   ├── test_kv_events.py
    │   ├── test_logprobs.py
    │   ├── test_mla_tp.py
    │   ├── test_modelopt.py
    │   ├── test_modelopt_fp8kvcache.py
    │   ├── test_models_from_modelscope.py
    │   ├── test_mori_transfer_engine_e2e.py
    │   ├── test_mscclpp.py
    │   ├── test_quick_allreduce.py
    │   ├── test_ray_engine.py
    │   ├── test_sagemaker_server.py
    │   ├── test_schedule_policy.py
    │   ├── test_srt_engine_with_quant_args.py
    │   ├── test_tokenizer_batch_encode.py
    │   ├── test_tokenizer_manager.py
    │   ├── test_torch_flex_attention_backend.py
    │   ├── test_torch_tp.py
    │   ├── test_tracing.py
    │   ├── test_triton_attention_rocm_mla.py
    │   ├── test_triton_moe_wna16.py
    │   ├── test_trtllm_fp8_kv_kernel.py
    │   ├── test_two_batch_overlap.py
    │   ├── test_vertex_endpoint.py
    │   ├── test_vlm_accuracy.py
    │   ├── test_wave_attention_backend.py
    │   ├── test_weight_validation.py
    │   ├── test_weight_version.py
    │   └── vlm/
    │       └── test_anthropic_vision.py
    ├── pytest.ini
    ├── registered/
    │   ├── 4-gpu-models/
    │   │   ├── test_deepseek_v3_cutedsl_4gpu.py
    │   │   ├── test_gpt_oss_4gpu.py
    │   │   ├── test_nvidia_nemotron_3_super_nvfp4.py
    │   │   ├── test_qwen35_models.py
    │   │   ├── test_qwen3_next_models.py
    │   │   └── test_qwen3_next_models_mtp.py
    │   ├── 8-gpu-models/
    │   │   ├── test_deepseek_v31.py
    │   │   ├── test_deepseek_v32.py
    │   │   ├── test_deepseek_v32_basic.py
    │   │   ├── test_deepseek_v32_cp_single_node.py
    │   │   ├── test_deepseek_v32_mtp.py
    │   │   ├── test_deepseek_v3_basic.py
    │   │   ├── test_deepseek_v3_mtp.py
    │   │   ├── test_glm_46.py
    │   │   ├── test_glm_46_fp8.py
    │   │   ├── test_gpt_oss_120b.py
    │   │   ├── test_kimi_k25.py
    │   │   ├── test_llama4.py
    │   │   ├── test_mimo_models.py
    │   │   ├── test_minimax_m25.py
    │   │   ├── test_mistral_large3.py
    │   │   ├── test_nvidia_nemotron_3_super_bf16.py
    │   │   ├── test_nvidia_nemotron_3_super_nightly.py
    │   │   ├── test_qwen35.py
    │   │   ├── test_qwen3_235b.py
    │   │   └── test_ring_2_5_1t.py
    │   ├── README.md
    │   ├── amd/
    │   │   ├── accuracy/
    │   │   │   ├── mi30x/
    │   │   │   │   ├── test_deepseek_r1_eval_amd.py
    │   │   │   │   ├── test_deepseek_v31_eval_amd.py
    │   │   │   │   ├── test_deepseek_v32_dp_eval_amd.py
    │   │   │   │   ├── test_deepseek_v32_eval_amd.py
    │   │   │   │   ├── test_deepseek_v32_mtp_eval_amd.py
    │   │   │   │   ├── test_deepseek_v32_tc_eval_amd.py
    │   │   │   │   ├── test_glm5_eval_amd.py
    │   │   │   │   ├── test_gpt_oss_eval_amd.py
    │   │   │   │   ├── test_grok1_fp8_eval_amd.py
    │   │   │   │   ├── test_grok1_int4_eval_amd.py
    │   │   │   │   ├── test_grok2_eval_amd.py
    │   │   │   │   ├── test_grok_eval_amd.py
    │   │   │   │   ├── test_gsm8k_eval_amd.py
    │   │   │   │   ├── test_kimi_k25_eval_amd.py
    │   │   │   │   ├── test_kimi_k2_eval_amd.py
    │   │   │   │   ├── test_minimax_m25_eval_amd.py
    │   │   │   │   ├── test_qwen35_eval_amd.py
    │   │   │   │   └── test_vlms_mmmu_eval_amd.py
    │   │   │   └── mi35x/
    │   │   │       ├── test_deepseek_r1_eval_mi35x.py
    │   │   │       ├── test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py
    │   │   │       ├── test_deepseek_r1_mxfp4_eval_mi35x.py
    │   │   │       ├── test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py
    │   │   │       ├── test_deepseek_v32_dp_eval_mi35x.py
    │   │   │       ├── test_deepseek_v32_eval_mi35x.py
    │   │   │       ├── test_deepseek_v32_mtp_eval_mi35x.py
    │   │   │       ├── test_glm5_eval_mi35x.py
    │   │   │       ├── test_gpt_oss_eval_mi35x.py
    │   │   │       ├── test_grok1_int4_eval_mi35x.py
    │   │   │       ├── test_grok2_eval_mi35x.py
    │   │   │       ├── test_kimi_k25_aiter_mla_eval_mi35x.py
    │   │   │       ├── test_kimi_k25_eval_mi35x.py
    │   │   │       ├── test_kimi_k25_mxfp4_eval_mi35x.py
    │   │   │       ├── test_kimi_k2_eval_mi35x.py
    │   │   │       ├── test_minimax_m25_eval_mi35x.py
    │   │   │       ├── test_qwen35_eval_mi35x.py
    │   │   │       └── test_qwen3_coder_next_eval_mi35x.py
    │   │   ├── disaggregation/
    │   │   │   ├── test_disaggregation_basic.py
    │   │   │   └── test_disaggregation_pp.py
    │   │   ├── perf/
    │   │   │   ├── mi30x/
    │   │   │   │   ├── test_deepseek_v31_perf.py
    │   │   │   │   ├── test_deepseek_v32_basic_perf_amd.py
    │   │   │   │   ├── test_deepseek_v32_mtp_perf_amd.py
    │   │   │   │   ├── test_deepseek_v3_perf.py
    │   │   │   │   ├── test_grok1_fp8_perf.py
    │   │   │   │   ├── test_grok1_int4_perf.py
    │   │   │   │   ├── test_grok2_perf.py
    │   │   │   │   ├── test_text_models_perf_amd.py
    │   │   │   │   └── test_vlms_perf_amd.py
    │   │   │   └── mi35x/
    │   │   │       ├── test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py
    │   │   │       ├── test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py
    │   │   │       ├── test_deepseek_r1_mxfp4_perf_mi35x.py
    │   │   │       ├── test_deepseek_v32_basic_perf_mi35x.py
    │   │   │       ├── test_deepseek_v32_mtp_perf_mi35x.py
    │   │   │       ├── test_grok1_int4_perf_mi35x.py
    │   │   │       └── test_grok2_perf_mi35x.py
    │   │   ├── test_deepseek_r1_mxfp4_8gpu.py
    │   │   ├── test_deepseek_v32_basic.py
    │   │   ├── test_deepseek_v32_mtp.py
    │   │   ├── test_deepseek_v3_basic.py
    │   │   ├── test_deepseek_v3_basic_kv_fp8.py
    │   │   ├── test_deepseek_v3_mtp.py
    │   │   ├── test_deepseek_v3_mtp_kv_fp8.py
    │   │   ├── test_kimi_k25_mxfp4.py
    │   │   ├── test_kimi_k2_instruct.py
    │   │   ├── test_moriep_small.py
    │   │   ├── test_qwen3_coder_next_8gpu.py
    │   │   ├── test_qwen3_instruct.py
    │   │   ├── test_qwen3_instruct_fp8.py
    │   │   ├── test_qwen3_instruct_mxfp4.py
    │   │   └── test_zimage_turbo.py
    │   ├── ascend/
    │   │   ├── basic_function/
    │   │   │   ├── HiCache/
    │   │   │   │   ├── test_npu_hierarchical_cache.py
    │   │   │   │   ├── test_npu_hierarchical_cache_mla.py
    │   │   │   │   ├── test_npu_hierarchical_cache_mutually_exclusive.py
    │   │   │   │   ├── test_npu_hierarchical_cache_ttft_mha.py
    │   │   │   │   └── test_npu_radix_cache.py
    │   │   │   ├── parallel_strategy/
    │   │   │   │   └── expert_parallelism/
    │   │   │   │       ├── test_npu_deepep_auto_deepseek_v3_2_w8a8.py
    │   │   │   │       ├── test_npu_deepep_auto_qwen3_480b.py
    │   │   │   │       ├── test_npu_deepep_auto_qwen3_next.py
    │   │   │   │       ├── test_npu_deepep_low_latency_deepseek_v3_2_w8a8.py
    │   │   │   │       ├── test_npu_deepep_low_latency_qwen3_480b.py
    │   │   │   │       └── test_npu_deepep_low_latency_qwen3_next.py
    │   │   │   ├── parameter/
    │   │   │   │   ├── deepseek_coder.json
    │   │   │   │   ├── test_npu_fim_completion.py
    │   │   │   │   ├── test_npu_log_level.py
    │   │   │   │   ├── test_npu_no_chunked_prefill.py
    │   │   │   │   ├── test_npu_no_overlap_scheduler.py
    │   │   │   │   ├── test_npu_original_logprobs.py
    │   │   │   │   └── test_npu_warmups.py
    │   │   │   └── speculative_inference/
    │   │   │       └── test_npu_eagle3.py
    │   │   ├── embedding_models/
    │   │   │   └── test_npu_bge_large_en_v1_5.py
    │   │   ├── interface/
    │   │   │   ├── test_npu_api.py
    │   │   │   ├── test_npu_api_abort_request.py
    │   │   │   ├── test_npu_api_encode.py
    │   │   │   ├── test_npu_enable_thinking.py
    │   │   │   ├── test_npu_matched_stop.py
    │   │   │   ├── test_npu_openai_function_calling.py
    │   │   │   ├── test_npu_openai_server_ignore_eos.py
    │   │   │   └── test_npu_penalty.py
    │   │   ├── llm_models/
    │   │   │   ├── test_npu_afm_4_5b.py
    │   │   │   ├── test_npu_baichuan2_13b_chat.py
    │   │   │   ├── test_npu_c4ai_command_r_v01.py
    │   │   │   ├── test_npu_chatglm2_6b.py
    │   │   │   ├── test_npu_deepseek_v3_2_exp_w8a8.py
    │   │   │   ├── test_npu_exaone_3.py
    │   │   │   ├── test_npu_gemma_3_4b_it_llm.py
    │   │   │   ├── test_npu_glm4_9b_chat.py
    │   │   │   ├── test_npu_granite_3_0_3b_a800m.py
    │   │   │   ├── test_npu_granite_3_1_8b.py
    │   │   │   ├── test_npu_grok_2.py
    │   │   │   ├── test_npu_internlm2_7b.py
    │   │   │   ├── test_npu_ling_lite.py
    │   │   │   ├── test_npu_llama4_scount_17b_16e.py
    │   │   │   ├── test_npu_llama_2_7b.py
    │   │   │   ├── test_npu_mimo_7b_rl.py
    │   │   │   ├── test_npu_minicpm3_4b.py
    │   │   │   ├── test_npu_mistral_7b.py
    │   │   │   ├── test_npu_persimmon_8b_chat.py
    │   │   │   ├── test_npu_phi_4_multimodal_llm.py
    │   │   │   ├── test_npu_qwen3_0_6b.py
    │   │   │   ├── test_npu_qwen3_1_7b_gptq_int8.py
    │   │   │   ├── test_npu_qwen3_235b_a22b_w8a8.py
    │   │   │   ├── test_npu_qwen3_30b.py
    │   │   │   ├── test_npu_qwen3_30b_w4a4.py
    │   │   │   ├── test_npu_qwen3_32b.py
    │   │   │   ├── test_npu_qwen3_coder_480b_a35b.py
    │   │   │   ├── test_npu_qwq_32b_w8a8.py
    │   │   │   ├── test_npu_smollm_1_7b.py
    │   │   │   ├── test_npu_stablelm_2_1_6b.py
    │   │   │   └── tool_chat_template_c4ai_command_r_v01.jinja
    │   │   ├── rerank_models/
    │   │   │   └── test_npu_bge_reranker_v2_m3.py
    │   │   ├── reward_models/
    │   │   │   ├── test_npu_gemma_2_27b_v0_2.py
    │   │   │   ├── test_npu_internlm2_7b_reward.py
    │   │   │   └── test_npu_llama_3_1_8b_v0_2.py
    │   │   ├── test_npu_memory_consumption.py
    │   │   └── vlm_models/
    │   │       ├── mmmu-val.yaml
    │   │       ├── test_npu_deepseek_vl2.py
    │   │       ├── test_npu_gemma_3_4b_it.py
    │   │       ├── test_npu_janus_pro_1b.py
    │   │       ├── test_npu_janus_pro_7b.py
    │   │       ├── test_npu_kimi_vl_a3b_instruct.py
    │   │       ├── test_npu_llama_3_2_11b_vision_instruct.py
    │   │       ├── test_npu_mimo_vl_7b_rl.py
    │   │       ├── test_npu_minicpm_o_2_6.py
    │   │       ├── test_npu_minicpm_v_2_6.py
    │   │       ├── test_npu_mistral_small_3_1_24b_instruct_2503.py
    │   │       ├── test_npu_phi4_multimodal_instruct.py
    │   │       ├── test_npu_qwen2_5_vl_3b_instruct.py
    │   │       ├── test_npu_qwen2_5_vl_72b_instruct.py
    │   │       ├── test_npu_qwen3_vl_235b_a22b_instruct.py
    │   │       ├── test_npu_qwen3_vl_30b_a3b_instruct.py
    │   │       ├── test_npu_qwen3_vl_4b_instruct.py
    │   │       └── test_npu_qwen3_vl_8b_instruct.py
    │   ├── attention/
    │   │   ├── test_chunk_gated_delta_rule.py
    │   │   ├── test_create_kvindices.py
    │   │   ├── test_fa3.py
    │   │   ├── test_flash_attention_4.py
    │   │   ├── test_hybrid_attn_backend.py
    │   │   ├── test_kda_kernels.py
    │   │   ├── test_local_attn.py
    │   │   ├── test_torch_native_attention_backend.py
    │   │   ├── test_triton_attention_backend.py
    │   │   ├── test_triton_attention_kernels.py
    │   │   ├── test_triton_sliding_window.py
    │   │   └── test_wave_attention_kernels.py
    │   ├── backends/
    │   │   ├── test_deepseek_r1_fp8_trtllm_backend.py
    │   │   ├── test_deepseek_v3_fp4_cutlass_moe.py
    │   │   ├── test_flashinfer_trtllm_gen_attn_backend.py
    │   │   ├── test_flashinfer_trtllm_gen_moe_backend.py
    │   │   ├── test_qwen3_fp4_trtllm_gen_moe.py
    │   │   └── test_torch_compile.py
    │   ├── bench_fn/
    │   │   ├── test_bench_serving_functionality.py
    │   │   └── test_benchmark_datasets_api.py
    │   ├── constrained_decoding/
    │   │   └── test_constrained_decoding.py
    │   ├── core/
    │   │   ├── test_cpp_radix_cache.py
    │   │   ├── test_deepseek_v3_deterministic.py
    │   │   ├── test_deterministic.py
    │   │   ├── test_gpt_oss_1gpu.py
    │   │   ├── test_gpt_oss_sm120.py
    │   │   ├── test_hidden_states.py
    │   │   ├── test_page_size.py
    │   │   ├── test_qwen3_next_deterministic.py
    │   │   ├── test_request_queue_validation.py
    │   │   ├── test_score_api.py
    │   │   ├── test_srt_endpoint.py
    │   │   └── test_srt_engine.py
    │   ├── debug_utils/
    │   │   ├── comparator/
    │   │   │   ├── __init__.py
    │   │   │   ├── aligner/
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── conftest.py
    │   │   │   │   ├── entrypoint/
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── conftest.py
    │   │   │   │   │   ├── test_executor.py
    │   │   │   │   │   └── test_planner.py
    │   │   │   │   ├── reorderer/
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── conftest.py
    │   │   │   │   │   ├── test_executor.py
    │   │   │   │   │   └── test_planner.py
    │   │   │   │   ├── test_axis_aligner.py
    │   │   │   │   ├── token_aligner/
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── conftest.py
    │   │   │   │   │   ├── test_aux_loader.py
    │   │   │   │   │   ├── test_aux_plugins.py
    │   │   │   │   │   ├── test_concat_steps.py
    │   │   │   │   │   ├── test_executor.py
    │   │   │   │   │   ├── test_planner.py
    │   │   │   │   │   └── test_thd_seq_lens_loader.py
    │   │   │   │   └── unsharder/
    │   │   │   │       ├── __init__.py
    │   │   │   │       ├── conftest.py
    │   │   │   │       ├── test_executor.py
    │   │   │   │       ├── test_parallel_info.py
    │   │   │   │       └── test_planner.py
    │   │   │   ├── conftest.py
    │   │   │   ├── dims_spec/
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── test_dim_parser.py
    │   │   │   │   ├── test_dims_parser.py
    │   │   │   │   ├── test_tensor_naming.py
    │   │   │   │   └── test_types.py
    │   │   │   ├── tensor_comparator/
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── conftest.py
    │   │   │   │   ├── test_comparator.py
    │   │   │   │   ├── test_formatter.py
    │   │   │   │   └── test_types.py
    │   │   │   ├── test_bundle_comparator.py
    │   │   │   ├── test_bundle_matcher.py
    │   │   │   ├── test_display.py
    │   │   │   ├── test_dp_utils.py
    │   │   │   ├── test_dump_loader.py
    │   │   │   ├── test_entrypoint.py
    │   │   │   ├── test_log_sink.py
    │   │   │   ├── test_manually_verify.py
    │   │   │   ├── test_meta_overrider.py
    │   │   │   ├── test_model_validation.py
    │   │   │   ├── test_output_types.py
    │   │   │   ├── test_per_token_visualizer.py
    │   │   │   ├── test_preset.py
    │   │   │   ├── test_utils.py
    │   │   │   ├── test_visualizer.py
    │   │   │   └── testing_helpers.py
    │   │   ├── source_patcher/
    │   │   │   ├── conftest.py
    │   │   │   ├── test_code_patcher.py
    │   │   │   ├── test_dumper_integration.py
    │   │   │   └── test_source_editor.py
    │   │   ├── test_crash_dump.py
    │   │   ├── test_cuda_coredump_smoke.py
    │   │   ├── test_dump_comparator.py
    │   │   ├── test_dump_loader.py
    │   │   ├── test_dumper.py
    │   │   ├── test_engine_dumper_comparator_e2e.py
    │   │   ├── test_schedule_simulator.py
    │   │   ├── test_soft_watchdog.py
    │   │   └── test_tensor_dump_forward_hook.py
    │   ├── disaggregation/
    │   │   ├── test_disaggregation_basic.py
    │   │   ├── test_disaggregation_decode_offload.py
    │   │   └── test_specv2_kvcache_offloading.py
    │   ├── distributed/
    │   │   ├── test_data_parallelism.py
    │   │   ├── test_disaggregation_aarch64.py
    │   │   ├── test_disaggregation_different_tp.py
    │   │   ├── test_disaggregation_dp_attention.py
    │   │   ├── test_disaggregation_hybrid_attention.py
    │   │   ├── test_disaggregation_pp.py
    │   │   ├── test_dp_attention.py
    │   │   ├── test_dp_attention_large.py
    │   │   ├── test_epd_disaggregation.py
    │   │   ├── test_load_weights_from_remote_instance.py
    │   │   ├── test_load_weights_from_remote_instance_npu.py
    │   │   ├── test_parallel_state.py
    │   │   └── test_pp_single_node.py
    │   ├── dllm/
    │   │   ├── test_llada2_mini.py
    │   │   └── test_llada2_mini_amd.py
    │   ├── embedding/
    │   │   ├── test_embedding_models.py
    │   │   ├── test_encoder_embedding_models.py
    │   │   ├── test_input_embeddings.py
    │   │   ├── test_input_embeds_chunked.py
    │   │   └── test_openai_embedding.py
    │   ├── ep/
    │   │   ├── test_deepep_large.py
    │   │   ├── test_deepep_small.py
    │   │   └── test_mooncake_ep_small.py
    │   ├── eval/
    │   │   ├── test_eval_accuracy_large.py
    │   │   ├── test_moe_eval_accuracy_large.py
    │   │   ├── test_text_models_gsm8k_eval.py
    │   │   └── test_vlms_mmmu_eval.py
    │   ├── function_call/
    │   │   └── test_kimik2_detector.py
    │   ├── hicache/
    │   │   ├── test_hicache_storage.py
    │   │   ├── test_hicache_storage_3fs_backend.py
    │   │   ├── test_hicache_storage_file_backend.py
    │   │   ├── test_hicache_storage_mooncake_backend.py
    │   │   ├── test_hicache_storage_runtime_attach_detach.py
    │   │   └── test_hicache_variants.py
    │   ├── kernels/
    │   │   ├── test_fp4_moe.py
    │   │   ├── test_fused_topk_deepseek.py
    │   │   └── test_nsa_indexer.py
    │   ├── layers/
    │   │   ├── mamba/
    │   │   │   ├── conftest.py
    │   │   │   ├── test_causal_conv1d.py
    │   │   │   ├── test_mamba2_mixer.py
    │   │   │   ├── test_mamba_ssm.py
    │   │   │   └── test_mamba_ssm_ssd.py
    │   │   └── test_fla_layernorm_guard.py
    │   ├── lora/
    │   │   ├── test_chunked_sgmv_backend.py
    │   │   ├── test_embedding_lora_support.py
    │   │   ├── test_fused_moe_lora_kernel.py
    │   │   ├── test_lora_backend.py
    │   │   ├── test_lora_eviction.py
    │   │   ├── test_lora_eviction_policy.py
    │   │   ├── test_lora_hf_sgl_logprob_diff.py
    │   │   ├── test_lora_openai_api.py
    │   │   ├── test_lora_openai_compatible.py
    │   │   ├── test_lora_overlap_loading.py
    │   │   ├── test_lora_qwen3.py
    │   │   ├── test_lora_radix_cache.py
    │   │   ├── test_lora_tied_lm_head.py
    │   │   ├── test_lora_tp.py
    │   │   ├── test_lora_update.py
    │   │   └── test_multi_lora_backend.py
    │   ├── metrics/
    │   │   ├── test_metrics.py
    │   │   └── test_priority_metrics.py
    │   ├── mla/
    │   │   ├── test_flashmla.py
    │   │   ├── test_mla.py
    │   │   ├── test_mla_deepseek_v3.py
    │   │   ├── test_mla_flashinfer.py
    │   │   ├── test_mla_fp8.py
    │   │   └── test_mla_int8_deepseek_v3.py
    │   ├── model_loading/
    │   │   ├── test_external_models.py
    │   │   └── test_utils_update_weights.py
    │   ├── models/
    │   │   ├── test_compressed_tensors_models.py
    │   │   ├── test_cross_encoder_models.py
    │   │   ├── test_dummy_grok_models.py
    │   │   ├── test_generation_models.py
    │   │   ├── test_gpt_oss_models_pcg.py
    │   │   ├── test_kimi_linear_models.py
    │   │   ├── test_kimi_linear_models_pcg.py
    │   │   ├── test_ministral3_models.py
    │   │   ├── test_nvidia_nemotron_3_nano.py
    │   │   ├── test_nvidia_nemotron_nano_v2.py
    │   │   ├── test_nvidia_nemotron_nano_v2_vl.py
    │   │   ├── test_qwen3_next_models_fp4.py
    │   │   ├── test_qwen3_next_models_pcg.py
    │   │   ├── test_qwen_models.py
    │   │   ├── test_reward_models.py
    │   │   ├── test_transformers_models.py
    │   │   └── test_vlm_models.py
    │   ├── moe/
    │   │   ├── test_cutedsl_moe.py
    │   │   ├── test_fused_moe.py
    │   │   ├── test_glm4_moe_models.py
    │   │   ├── test_moe_ep.py
    │   │   ├── test_torch_compile_moe.py
    │   │   ├── test_triton_fused_moe.py
    │   │   └── test_triton_moe_channel_fp8_kernel.py
    │   ├── openai_server/
    │   │   ├── basic/
    │   │   │   ├── test_anthropic_server.py
    │   │   │   ├── test_openai_server.py
    │   │   │   ├── test_protocol.py
    │   │   │   ├── test_serving_chat.py
    │   │   │   ├── test_serving_completions.py
    │   │   │   └── test_serving_rerank.py
    │   │   ├── features/
    │   │   │   ├── test_enable_thinking.py
    │   │   │   ├── test_json_mode.py
    │   │   │   ├── test_openai_server_ebnf.py
    │   │   │   ├── test_openai_server_hidden_states.py
    │   │   │   └── test_reasoning_content.py
    │   │   ├── function_call/
    │   │   │   ├── test_anthropic_tool_use.py
    │   │   │   ├── test_openai_function_calling.py
    │   │   │   └── test_tool_choice.py
    │   │   └── validation/
    │   │       ├── test_large_max_new_tokens.py
    │   │       ├── test_matched_stop.py
    │   │       ├── test_openai_server_ignore_eos.py
    │   │       └── test_request_length_validation.py
    │   ├── ops/
    │   │   ├── test_aiter_allreduce_fusion_amd.py
    │   │   └── test_repeat_interleave.py
    │   ├── perf/
    │   │   ├── test_bench_one_batch_1gpu.py
    │   │   ├── test_bench_one_batch_2gpu.py
    │   │   ├── test_bench_serving_1gpu_large.py
    │   │   ├── test_bench_serving_1gpu_part1.py
    │   │   ├── test_bench_serving_1gpu_part2.py
    │   │   ├── test_bench_serving_2gpu.py
    │   │   ├── test_dpsk_r1_fp4_4gpu_perf.py
    │   │   ├── test_gpt_oss_4gpu_perf.py
    │   │   ├── test_text_models_perf.py
    │   │   ├── test_vlm_perf_5090.py
    │   │   └── test_vlms_perf.py
    │   ├── piecewise_cuda_graph/
    │   │   └── test_piecewise_cuda_graph_support_1_gpu.py
    │   ├── profiling/
    │   │   ├── test_profile_v2.py
    │   │   └── test_start_profile.py
    │   ├── quant/
    │   │   ├── test_autoround.py
    │   │   ├── test_awq.py
    │   │   ├── test_awq_dequant.py
    │   │   ├── test_block_int8.py
    │   │   ├── test_bnb.py
    │   │   ├── test_deepseek_v32_fp4_4gpu.py
    │   │   ├── test_deepseek_v32_fp4_mtp_4gpu.py
    │   │   ├── test_deepseek_v3_fp4_4gpu.py
    │   │   ├── test_eval_fp8_accuracy.py
    │   │   ├── test_fp8_blockwise_gemm.py
    │   │   ├── test_fp8_kernel.py
    │   │   ├── test_fp8_utils.py
    │   │   ├── test_fp8kv_triton.py
    │   │   ├── test_fused_rms_fp8_group_quant.py
    │   │   ├── test_gguf.py
    │   │   ├── test_gptqmodel_dynamic.py
    │   │   ├── test_int4fp8_moe.py
    │   │   ├── test_int8_kernel.py
    │   │   ├── test_marlin_moe.py
    │   │   ├── test_modelopt_fp8.py
    │   │   ├── test_nvfp4_gemm.py
    │   │   ├── test_quant_config_parsing.py
    │   │   ├── test_quantization.py
    │   │   ├── test_torchao.py
    │   │   ├── test_triton_scaled_mm.py
    │   │   ├── test_w4a8_deepseek_v3.py
    │   │   └── test_w8a8_quantization.py
    │   ├── radix_cache/
    │   │   ├── test_radix_attention.py
    │   │   ├── test_radix_cache_hit.py
    │   │   └── test_swa_radix_cache_kl.py
    │   ├── rl/
    │   │   ├── test_fp32_lm_head.py
    │   │   ├── test_lora_load_from_tensor.py
    │   │   ├── test_multi_instance_release_memory_occupation.py
    │   │   ├── test_patch_torch.py
    │   │   ├── test_release_memory_occupation.py
    │   │   ├── test_return_routed_experts.py
    │   │   ├── test_update_weights_from_disk.py
    │   │   ├── test_update_weights_from_distributed.py
    │   │   └── test_update_weights_from_tensor.py
    │   ├── rotary/
    │   │   ├── test_mrope.py
    │   │   └── test_rope_rocm.py
    │   ├── sampling/
    │   │   ├── test_original_logprobs.py
    │   │   ├── test_penalty.py
    │   │   └── test_pytorch_sampling_backend.py
    │   ├── scheduler/
    │   │   ├── test_abort.py
    │   │   ├── test_chunked_prefill.py
    │   │   ├── test_no_chunked_prefill.py
    │   │   ├── test_no_overlap_scheduler.py
    │   │   ├── test_prefill_delayer.py
    │   │   ├── test_priority_scheduling.py
    │   │   ├── test_retract_decode.py
    │   │   └── test_routing_key_scheduling.py
    │   ├── sessions/
    │   │   ├── test_session_control.py
    │   │   ├── test_session_latency.py
    │   │   └── test_streaming_session.py
    │   ├── spec/
    │   │   ├── eagle/
    │   │   │   ├── test_deepseek_v3_fp4_mtp_small.py
    │   │   │   ├── test_eagle3_basic.py
    │   │   │   ├── test_eagle_constrained_decoding.py
    │   │   │   ├── test_eagle_dp_attention.py
    │   │   │   ├── test_eagle_infer_a.py
    │   │   │   ├── test_eagle_infer_b.py
    │   │   │   ├── test_eagle_infer_beta.py
    │   │   │   ├── test_eagle_infer_beta_dp_attention.py
    │   │   │   └── test_eagle_infer_beta_dp_attention_large.py
    │   │   ├── test_constrained_decoding_spec_reasoning.py
    │   │   ├── test_ngram_speculative_decoding.py
    │   │   ├── test_standalone_speculative_decoding.py
    │   │   └── utils/
    │   │       └── test_build_eagle_tree.py
    │   ├── stress/
    │   │   ├── test_stress_deepseek_v3.py
    │   │   ├── test_stress_glm_4_6.py
    │   │   ├── test_stress_kimi_k2.py
    │   │   └── test_stress_qwen3_235b.py
    │   ├── test_hybrid_dp_ep_tp_mtp.py
    │   ├── test_srt_backend.py
    │   ├── tokenizer/
    │   │   ├── test_multi_tokenizer.py
    │   │   └── test_skip_tokenizer_init.py
    │   ├── unit/
    │   │   ├── README.md
    │   │   ├── batch_invariant_ops/
    │   │   │   └── test_batch_invariant_ops.py
    │   │   ├── entrypoints/
    │   │   │   ├── openai/
    │   │   │   │   └── test_serving_embedding.py
    │   │   │   └── test_ssl_cert_refresher.py
    │   │   ├── function_call/
    │   │   │   ├── test_function_call_parser.py
    │   │   │   ├── test_glm47_moe_detector.py
    │   │   │   ├── test_json_schema_constraint.py
    │   │   │   ├── test_parallel_tool_calls.py
    │   │   │   └── test_unknown_tool_name.py
    │   │   ├── layers/
    │   │   │   ├── test_conv_layer.py
    │   │   │   └── test_mamba_state_scatter_triton.py
    │   │   ├── managers/
    │   │   │   ├── test_io_struct.py
    │   │   │   ├── test_prefill_adder.py
    │   │   │   └── test_profile_merger_http_api.py
    │   │   ├── mem_cache/
    │   │   │   ├── test_evict_policy.py
    │   │   │   ├── test_mamba_unittest.py
    │   │   │   ├── test_nsa_pool_host_unit.py
    │   │   │   ├── test_radix_cache_slru_accuracy.py
    │   │   │   ├── test_radix_cache_unit.py
    │   │   │   └── test_swa_unittest.py
    │   │   ├── model_executor/
    │   │   │   └── test_model_hooks.py
    │   │   ├── model_loader/
    │   │   │   ├── test_modelopt_export.py
    │   │   │   └── test_modelopt_loader.py
    │   │   ├── observability/
    │   │   │   ├── test_cpu_monitor.py
    │   │   │   └── test_metrics_utils.py
    │   │   ├── parser/
    │   │   │   ├── test_harmony_parser.py
    │   │   │   ├── test_jinja_template_utils.py
    │   │   │   └── test_reasoning_parser.py
    │   │   ├── server_args/
    │   │   │   └── test_server_args.py
    │   │   └── utils/
    │   │       ├── test_gauge_histogram.py
    │   │       ├── test_json_response.py
    │   │       ├── test_patch_tokenizer.py
    │   │       └── test_profile_merger.py
    │   ├── utils/
    │   │   ├── test_bench_typebaseddispatcher.py
    │   │   ├── test_log_utils.py
    │   │   ├── test_model_file_verifier.py
    │   │   ├── test_network_address.py
    │   │   ├── test_request_logger.py
    │   │   ├── test_scheduler_status_logger.py
    │   │   ├── test_socket_utils.py
    │   │   └── test_type_based_dispatcher.py
    │   └── vlm/
    │       ├── test_encoder_dp.py
    │       ├── test_evs.py
    │       ├── test_patch_embed_perf.py
    │       ├── test_video_utils.py
    │       ├── test_vision_chunked_prefill.py
    │       ├── test_vision_openai_server_a.py
    │       └── test_vlm_input_format.py
    ├── run_suite.py
    ├── run_suite_nightly.py
    ├── show_partitions.py
    └── srt/
        ├── ascend/
        │   ├── test_ascend_autoround_dense.py
        │   ├── test_ascend_autoround_moe.py
        │   ├── test_ascend_compile_graph_tp1_bf16.py
        │   ├── test_ascend_deepep.py
        │   ├── test_ascend_gptq_moe.py
        │   ├── test_ascend_graph_tp1_bf16.py
        │   ├── test_ascend_graph_tp2_bf16.py
        │   ├── test_ascend_hicache_mha.py
        │   ├── test_ascend_hicache_mla.py
        │   ├── test_ascend_mla_fia_w8a8int8.py
        │   ├── test_ascend_mla_w8a8int8.py
        │   ├── test_ascend_piecewise_graph_prefill.py
        │   ├── test_ascend_sampling_backend.py
        │   ├── test_ascend_tp1_bf16.py
        │   ├── test_ascend_tp2_bf16.py
        │   ├── test_ascend_tp2_fia_bf16.py
        │   ├── test_ascend_tp4_bf16.py
        │   ├── test_ascend_w4a4_quantization.py
        │   ├── test_ascend_w8a8_quantization.py
        │   └── test_llada2_mini_ascend.py
        ├── configs/
        │   ├── deepseek_v3.yaml
        │   ├── deepseek_v3_long_context.yaml
        │   ├── llama_405b.yaml
        │   ├── random_config.yaml
        │   ├── random_flashinfer_vs_triton_config.yaml
        │   └── sharegpt_config.yaml
        ├── cpu/
        │   ├── test_activation.py
        │   ├── test_binding.py
        │   ├── test_bmm.py
        │   ├── test_causal_conv1d.py
        │   ├── test_cpu_graph.py
        │   ├── test_decode.py
        │   ├── test_extend.py
        │   ├── test_flash_attn.py
        │   ├── test_gemm.py
        │   ├── test_intel_amx_attention_backend_a.py
        │   ├── test_intel_amx_attention_backend_b.py
        │   ├── test_intel_amx_attention_backend_c.py
        │   ├── test_mamba.py
        │   ├── test_mla.py
        │   ├── test_moe.py
        │   ├── test_norm.py
        │   ├── test_qkv_proj_with_rope.py
        │   ├── test_qwen3.py
        │   ├── test_rope.py
        │   ├── test_shared_expert.py
        │   ├── test_topk.py
        │   └── utils.py
        ├── double-sparsity-config-Llama-3.1-8B-Instruct.json
        ├── experiment_runner.py
        ├── kv_cache_scales_llama3_1_8b.json
        ├── kv_cache_scales_llama3_8b.json
        ├── kv_cache_scales_qwen2_1_5b.json
        ├── models/
        │   └── compare.py
        ├── parse_results.py
        ├── run_suite.py
        ├── test_embed_interpolate_unittest.py
        └── xpu/
            ├── test_deepseek_ocr.py
            └── test_intel_xpu_backend.py