Copy disabled (too large)
Download .txt
Showing preview only (16,705K chars total). Download the full file to get everything.
Repository: sgl-project/sglang
Branch: main
Commit: c82d20d48ecc
Files: 4115
Total size: 40.1 MB
Directory structure:
gitextract_8m4rfrdr/
├── .claude/
│ └── skills/
│ ├── add-jit-kernel/
│ │ └── SKILL.md
│ ├── add-sgl-kernel/
│ │ └── SKILL.md
│ ├── sglang-bisect-ci-regression/
│ │ └── SKILL.md
│ └── write-sglang-test/
│ └── SKILL.md
├── .codespellrc
├── .coveragerc
├── .devcontainer/
│ ├── Dockerfile
│ └── devcontainer.json
├── .github/
│ ├── CI_PERMISSIONS.json
│ ├── CODEOWNERS
│ ├── FOLDER_README.md
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1-bug-report.yml
│ │ └── 2-feature-request.yml
│ ├── MAINTAINER.md
│ ├── actions/
│ │ ├── upload-cuda-coredumps/
│ │ │ └── action.yml
│ │ └── wait-for-jobs/
│ │ └── action.yml
│ ├── labeler.yml
│ ├── pull_request_template.md
│ ├── update_ci_permission.py
│ └── workflows/
│ ├── amd-aiter-scout.yml
│ ├── amd-ci-job-monitor.yml
│ ├── auto-tune.yml
│ ├── bot-bump-flashinfer-version.yml
│ ├── bot-bump-kernel-version-to-sglang.yml
│ ├── bot-bump-kernel-version.yml
│ ├── bot-bump-sglang-version.yml
│ ├── bot-cherry-pick.yml
│ ├── cancel-pr-workflow-on-merge.yml
│ ├── cancel-unfinished-pr-tests.yml
│ ├── ci-coverage-overview.yml
│ ├── ci-failure-monitor.yml
│ ├── close-inactive-issues.yml
│ ├── diffusion-ci-gt-gen.yml
│ ├── execute-notebook.yml
│ ├── labeler.yml
│ ├── lint.yml
│ ├── list-active-pr-runs.yml.yml
│ ├── nightly-release-gateway.yml
│ ├── nightly-test-amd-rocm720.yml
│ ├── nightly-test-amd.yml
│ ├── nightly-test-intel.yml
│ ├── nightly-test-npu.yml
│ ├── nightly-test-nvidia.yml
│ ├── open-pr-copy-from-oss.yml
│ ├── open-pr-copy-to-oss.yml
│ ├── patch-docker-dev.yml
│ ├── pr-benchmark-rust.yml
│ ├── pr-gate.yml
│ ├── pr-test-amd-rocm720.yml
│ ├── pr-test-amd.yml
│ ├── pr-test-npu.yml
│ ├── pr-test-rust.yml
│ ├── pr-test-xeon.yml
│ ├── pr-test-xpu.yml
│ ├── pr-test.yml
│ ├── release-branch-cut.yml
│ ├── release-docker-amd-nightly.yml
│ ├── release-docker-amd-rocm720-nightly.yml
│ ├── release-docker-amd.yml
│ ├── release-docker-cu13-framework.yml
│ ├── release-docker-dev.yml
│ ├── release-docker-gateway.yml
│ ├── release-docker-npu-nightly.yml
│ ├── release-docker-npu.yml
│ ├── release-docker-xeon.yml
│ ├── release-docker.yml
│ ├── release-docs.yml
│ ├── release-pypi-gateway.yml
│ ├── release-pypi-nightly.yml
│ ├── release-pypi-pr.yml
│ ├── release-pypi.yml
│ ├── release-tag.yml
│ ├── release-whl-kernel.yml
│ ├── rerun-ut.yml
│ ├── retag-docker.yml
│ ├── runner-utilization.yml
│ ├── slash-command-handler.yml
│ ├── stress-test.yml
│ └── weekly-test-nvidia.yml
├── .gitignore
├── .isort.cfg
├── .pre-commit-config.yaml
├── 3rdparty/
│ └── amd/
│ ├── profiling/
│ │ ├── PROFILING.md
│ │ ├── client.sh
│ │ ├── install_rpd.sh
│ │ ├── loadTracer.sh
│ │ ├── rpd.patch
│ │ ├── rpd_profile_server_enable.patch
│ │ ├── rpd_profile_server_enable_wCPU_activities.patch
│ │ ├── server.sh
│ │ └── torch_profiler.patch
│ ├── tuning/
│ │ ├── TUNING.md
│ │ └── benchmark_moe_rocm.py
│ └── wheel/
│ ├── README.md
│ ├── sgl-kernel/
│ │ ├── CMakeLists_rocm.txt
│ │ ├── build_rocm.sh
│ │ ├── rename_wheels_rocm.sh
│ │ └── rocm_hipify.py
│ └── sglang/
│ └── pyproject.toml
├── LICENSE
├── README.md
├── benchmark/
│ ├── asr/
│ │ ├── README.md
│ │ └── bench_sglang.py
│ ├── bench_attention_sink/
│ │ └── bench_attention_sink_triton.py
│ ├── bench_in_batch_prefix/
│ │ └── bench_in_batch_prefix.py
│ ├── bench_linear_attention/
│ │ ├── bench_gdn_decode.py
│ │ └── bench_gdn_prefill.py
│ ├── bench_rope/
│ │ └── benchmark_rope_index.py
│ ├── benchmark_batch/
│ │ ├── benchmark_batch.py
│ │ └── benchmark_tokenizer.py
│ ├── benchmark_vllm_060/
│ │ └── README.md
│ ├── blog_v0_2/
│ │ ├── 405b_sglang.sh
│ │ ├── 405b_trt.sh
│ │ ├── 405b_vllm.sh
│ │ ├── README.md
│ │ └── config.md
│ ├── boolq/
│ │ ├── README.md
│ │ ├── bench_sglang.py
│ │ ├── convert_parquet_to_json.py
│ │ └── parquet_to_json.sh
│ ├── ceval/
│ │ ├── README.md
│ │ └── bench_sglang.py
│ ├── deepseek_v3/
│ │ └── README.md
│ ├── dspy/
│ │ ├── README.md
│ │ └── bench_dspy_intro.py
│ ├── fla/
│ │ └── benchmark_layernorm_gated.py
│ ├── generative_agents/
│ │ ├── README.md
│ │ ├── agent_functions.py
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── gpt_oss/
│ │ └── README.md
│ ├── gsm8k/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── hellaswag/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── hf3fs/
│ │ ├── bench.sh
│ │ ├── bench_client.py
│ │ ├── bench_storage.py
│ │ └── bench_zerocopy.py
│ ├── hicache/
│ │ ├── README.md
│ │ ├── bench_long_context.py
│ │ ├── bench_mix.py
│ │ ├── bench_mix.sh
│ │ ├── bench_multiturn.py
│ │ ├── bench_serving.py
│ │ ├── data_processing.py
│ │ ├── download.sh
│ │ ├── nextqa.py
│ │ └── perf.py
│ ├── json_decode_regex/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── build_dataset.py
│ ├── json_jump_forward/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ ├── build_dataset.py
│ │ └── dataset.txt
│ ├── json_schema/
│ │ ├── README.md
│ │ └── bench_sglang.py
│ ├── kernels/
│ │ ├── all_reduce/
│ │ │ ├── benchmark_aiter.py
│ │ │ ├── benchmark_all_reduce.py
│ │ │ ├── benchmark_fused_ar_rms_amd.py
│ │ │ ├── benchmark_mscclpp.py
│ │ │ └── benchmark_torch_symm_mem.py
│ │ ├── decoding_attention_triton/
│ │ │ └── triton_flashinfer_cudnn.py
│ │ ├── deepep/
│ │ │ ├── deepep_utils.py
│ │ │ └── tuning_deepep.py
│ │ ├── deepseek/
│ │ │ ├── README.md
│ │ │ ├── benchmark_deepgemm_fp8_gemm.py
│ │ │ ├── benchmark_deepgemm_fp8_gemm_blackwell.py
│ │ │ └── benchmark_deepgemm_fp8_group_gemm.py
│ │ ├── elementwise/
│ │ │ └── benchmark_concat_mla.py
│ │ ├── flashinfer_allreduce_fusion/
│ │ │ ├── README.md
│ │ │ └── benchmark_fused_collective.py
│ │ ├── fused_moe_triton/
│ │ │ ├── README.md
│ │ │ ├── benchmark_sglang_fused_moe_triton.py
│ │ │ ├── benchmark_torch_compile_fused_moe.py
│ │ │ ├── benchmark_vllm_vs_sglang_fused_moe_triton.py
│ │ │ ├── common_utils.py
│ │ │ ├── tuning_client.py
│ │ │ ├── tuning_fused_moe_triton.py
│ │ │ ├── tuning_fused_moe_triton_sep.py
│ │ │ └── tuning_text.json
│ │ ├── quantization/
│ │ │ ├── README.md
│ │ │ ├── bench_fp4_quant.py
│ │ │ ├── bench_int8_quant.py
│ │ │ └── tuning_block_wise_kernel.py
│ │ ├── scheduler_batch/
│ │ │ ├── benchmark_get_last_loc_triton.py
│ │ │ └── benchmark_write_req_to_token_pool_triton.py
│ │ └── sliding_window_attention_triton/
│ │ └── bench_triton_swa_kernel.py
│ ├── line_retrieval/
│ │ ├── README.md
│ │ ├── bench_sglang.py
│ │ └── gen_data.py
│ ├── llava_bench/
│ │ ├── README.md
│ │ ├── bench_hf_llava_bench.sh
│ │ ├── bench_hf_mme.sh
│ │ ├── bench_sglang.py
│ │ ├── bench_sglang_mme.sh
│ │ └── download_images.py
│ ├── llm_judge/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── long_json_decode/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── build_dataset.py
│ ├── lora/
│ │ ├── launch_server.py
│ │ └── lora_bench.py
│ ├── mmlu/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── download_data.sh
│ ├── mmmu/
│ │ ├── README.md
│ │ ├── bench_hf.py
│ │ ├── bench_sglang.py
│ │ ├── data_utils.py
│ │ ├── eval_utils.py
│ │ └── prompt_format.yaml
│ ├── mtbench/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── bench_sglang_eagle.py
│ ├── multi_chain_reasoning/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── multi_document_qa/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── build_dataset.py
│ ├── multi_turn_chat/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ ├── data_gen.py
│ │ └── long_prompt_multi_turn.py
│ ├── prefill_only/
│ │ ├── bench_embeddings.py
│ │ ├── bench_score.py
│ │ └── util.py
│ ├── react/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── reasoning_benchmark/
│ │ ├── README.md
│ │ ├── answer_extraction.py
│ │ ├── bench_sglang.py
│ │ └── eval_utils.py
│ ├── tip_suggestion/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ ├── lmql_funcs.py
│ │ └── topic.jsonl
│ ├── tree_of_thought_deep/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── lmql_funcs.py
│ └── tree_of_thought_v0/
│ ├── README.md
│ ├── bench_other.py
│ └── bench_sglang.py
├── docker/
│ ├── Dockerfile
│ ├── compose.yaml
│ ├── configs/
│ │ ├── .zshrc
│ │ ├── opt/
│ │ │ ├── .gitconfig
│ │ │ ├── .tmux.conf
│ │ │ └── .vimrc
│ │ └── yank
│ ├── diffusion.Dockerfile
│ ├── gateway.Dockerfile
│ ├── k8s-sglang-distributed-sts.yaml
│ ├── k8s-sglang-service.yaml
│ ├── npu.Dockerfile
│ ├── rocm.Dockerfile
│ ├── sagemaker.Dockerfile
│ ├── serve
│ ├── xeon.Dockerfile
│ └── xpu.Dockerfile
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── _static/
│ │ └── css/
│ │ ├── custom_log.css
│ │ └── readthedocs.css
│ ├── advanced_features/
│ │ ├── attention_backend.md
│ │ ├── checkpoint_engine.md
│ │ ├── cuda_graph_for_multi_modal_encoder.md
│ │ ├── deterministic_inference.md
│ │ ├── dp_dpa_smg_guide.md
│ │ ├── dp_for_multi_modal_encoder.md
│ │ ├── epd_disaggregation.md
│ │ ├── expert_parallelism.md
│ │ ├── forward_hooks.md
│ │ ├── hicache.rst
│ │ ├── hicache_best_practices.md
│ │ ├── hicache_design.md
│ │ ├── hicache_storage_runtime_attach_detach.md
│ │ ├── hyperparameter_tuning.md
│ │ ├── lora.ipynb
│ │ ├── observability.md
│ │ ├── pd_disaggregation.md
│ │ ├── piecewise_cuda_graph.md
│ │ ├── pipeline_parallelism.md
│ │ ├── quantization.md
│ │ ├── quantized_kv_cache.md
│ │ ├── rfork.md
│ │ ├── separate_reasoning.ipynb
│ │ ├── server_arguments.md
│ │ ├── sgl_model_gateway.md
│ │ ├── sglang_for_rl.md
│ │ ├── speculative_decoding.md
│ │ ├── structured_outputs.ipynb
│ │ ├── structured_outputs_for_reasoning_models.ipynb
│ │ ├── tool_parser.ipynb
│ │ └── vlm_query.ipynb
│ ├── basic_usage/
│ │ ├── deepseek_ocr.md
│ │ ├── deepseek_v3.md
│ │ ├── deepseek_v32.md
│ │ ├── glm45.md
│ │ ├── glmv.md
│ │ ├── gpt_oss.md
│ │ ├── llama4.md
│ │ ├── minimax_m2.md
│ │ ├── native_api.ipynb
│ │ ├── offline_engine_api.ipynb
│ │ ├── ollama_api.md
│ │ ├── openai_api.rst
│ │ ├── openai_api_completions.ipynb
│ │ ├── openai_api_embeddings.ipynb
│ │ ├── openai_api_vision.ipynb
│ │ ├── popular_model_usage.rst
│ │ ├── qwen3.md
│ │ ├── qwen3_5.md
│ │ ├── qwen3_vl.md
│ │ ├── sampling_params.md
│ │ └── send_request.ipynb
│ ├── conf.py
│ ├── deploy.py
│ ├── developer_guide/
│ │ ├── bench_serving.md
│ │ ├── benchmark_and_profiling.md
│ │ ├── contribution_guide.md
│ │ ├── development_guide_using_docker.md
│ │ ├── development_jit_kernel_guide.md
│ │ ├── evaluating_new_models.md
│ │ ├── release_process.md
│ │ └── setup_github_runner.md
│ ├── diffusion/
│ │ ├── api/
│ │ │ ├── cli.md
│ │ │ ├── openai_api.md
│ │ │ └── post_processing.md
│ │ ├── ci_perf.md
│ │ ├── compatibility_matrix.md
│ │ ├── contributing.md
│ │ ├── environment_variables.md
│ │ ├── index.md
│ │ ├── installation.md
│ │ ├── performance/
│ │ │ ├── attention_backends.md
│ │ │ ├── cache/
│ │ │ │ ├── cache_dit.md
│ │ │ │ ├── index.md
│ │ │ │ └── teacache.md
│ │ │ ├── index.md
│ │ │ └── profiling.md
│ │ └── support_new_models.md
│ ├── get_started/
│ │ └── install.md
│ ├── index.rst
│ ├── performance_dashboard/
│ │ ├── README.md
│ │ ├── app.js
│ │ ├── fetch_metrics.py
│ │ ├── index.html
│ │ └── server.py
│ ├── platforms/
│ │ ├── amd_gpu.md
│ │ ├── apple_metal.md
│ │ ├── ascend_contribution_guide.md
│ │ ├── ascend_npu.md
│ │ ├── ascend_npu_best_practice.md
│ │ ├── ascend_npu_deepseek_example.md
│ │ ├── ascend_npu_environment_variables.md
│ │ ├── ascend_npu_glm5_examples.md
│ │ ├── ascend_npu_quantization.md
│ │ ├── ascend_npu_qwen3_5_examples.md
│ │ ├── ascend_npu_qwen3_examples.md
│ │ ├── ascend_npu_support.rst
│ │ ├── ascend_npu_support_features.md
│ │ ├── ascend_npu_support_models.md
│ │ ├── cpu_server.md
│ │ ├── mindspore_backend.md
│ │ ├── mthreads_gpu.md
│ │ ├── nvidia_jetson.md
│ │ ├── tpu.md
│ │ └── xpu.md
│ ├── references/
│ │ ├── custom_chat_template.md
│ │ ├── environment_variables.md
│ │ ├── faq.md
│ │ ├── frontend/
│ │ │ ├── choices_methods.md
│ │ │ ├── frontend_index.rst
│ │ │ └── frontend_tutorial.ipynb
│ │ ├── learn_more.md
│ │ ├── multi_node_deployment/
│ │ │ ├── deploy_on_k8s.md
│ │ │ ├── lws_pd/
│ │ │ │ ├── lws-examples/
│ │ │ │ │ ├── d-svc.yaml
│ │ │ │ │ ├── d.yaml
│ │ │ │ │ ├── lb.yaml
│ │ │ │ │ ├── p-svc.yaml
│ │ │ │ │ └── p.yaml
│ │ │ │ └── lws_pd_deploy.md
│ │ │ ├── multi_node.md
│ │ │ ├── multi_node_index.rst
│ │ │ └── rbg_pd/
│ │ │ └── deepseekv32_pd.md
│ │ ├── post_training_integration.md
│ │ ├── production_metrics.md
│ │ ├── production_request_trace.md
│ │ ├── release_lookup.rst
│ │ └── torch_compile_cache.md
│ ├── release_lookup/
│ │ ├── README.md
│ │ ├── generate_index.py
│ │ ├── index.html
│ │ └── release_index.json
│ ├── requirements.txt
│ ├── serve.sh
│ ├── supported_models/
│ │ ├── extending/
│ │ │ ├── index.rst
│ │ │ ├── mindspore_models.md
│ │ │ ├── modelscope.md
│ │ │ ├── support_new_models.md
│ │ │ └── transformers_fallback.md
│ │ ├── index.rst
│ │ ├── retrieval_ranking/
│ │ │ ├── classify_models.md
│ │ │ ├── embedding_models.md
│ │ │ ├── index.rst
│ │ │ └── rerank_models.md
│ │ ├── specialized/
│ │ │ ├── index.rst
│ │ │ └── reward_models.md
│ │ └── text_generation/
│ │ ├── diffusion_language_models.md
│ │ ├── generative_models.md
│ │ ├── index.rst
│ │ └── multimodal_language_models.md
│ └── wrap_run_llm.py
├── examples/
│ ├── assets/
│ │ └── .gitignore
│ ├── chat_template/
│ │ ├── qwen3_reranker.jinja
│ │ ├── qwen3_vl_reranker.jinja
│ │ ├── tool_chat_template_deepseekr1.jinja
│ │ ├── tool_chat_template_deepseekv3.jinja
│ │ ├── tool_chat_template_deepseekv31.jinja
│ │ ├── tool_chat_template_deepseekv32.jinja
│ │ ├── tool_chat_template_llama3.1_json.jinja
│ │ ├── tool_chat_template_llama4_pythonic.jinja
│ │ └── vision_template_sarashina_vl.jinja
│ ├── checkpoint_engine/
│ │ └── update.py
│ ├── frontend_language/
│ │ ├── quick_start/
│ │ │ ├── anthropic_example_chat.py
│ │ │ ├── anthropic_example_complete.py
│ │ │ ├── azure_openai_example_chat.py
│ │ │ ├── gemini_example_chat.py
│ │ │ ├── gemini_example_complete.py
│ │ │ ├── gemini_example_multimodal_chat.py
│ │ │ ├── local_example_chat.py
│ │ │ ├── local_example_complete.py
│ │ │ ├── local_example_llava_next.py
│ │ │ ├── openai_example_chat.py
│ │ │ ├── openai_example_complete.py
│ │ │ ├── openai_example_n.py
│ │ │ ├── openai_example_o1.py
│ │ │ ├── openrouter_example_chat.py
│ │ │ ├── together_example_chat.py
│ │ │ └── together_example_complete.py
│ │ └── usage/
│ │ ├── chinese_regex.py
│ │ ├── choices_logprob.py
│ │ ├── cot_decoding.py
│ │ ├── json_decode.py
│ │ ├── json_logprobs.py
│ │ ├── llava_video/
│ │ │ ├── srt_example_llava_v.py
│ │ │ └── srt_example_llava_v.sh
│ │ ├── openai_chat_speculative.py
│ │ ├── openai_speculative.py
│ │ ├── parallel_sample.py
│ │ ├── rag_using_parea/
│ │ │ └── trace_and_evaluate_rag_using_parea.ipynb
│ │ ├── readme_examples.py
│ │ ├── sgl_gen_min_tokens.py
│ │ ├── streaming.py
│ │ └── triton/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── models/
│ │ └── character_generation/
│ │ ├── 1/
│ │ │ └── model.py
│ │ └── config.pbtxt
│ ├── monitoring/
│ │ ├── README.md
│ │ ├── docker-compose.yaml
│ │ ├── grafana/
│ │ │ ├── dashboards/
│ │ │ │ ├── config/
│ │ │ │ │ └── dashboard.yaml
│ │ │ │ └── json/
│ │ │ │ └── sglang-dashboard.json
│ │ │ └── datasources/
│ │ │ └── datasource.yaml
│ │ ├── opentelemetry.yaml
│ │ ├── prometheus.yaml
│ │ └── tracing_compose.yaml
│ ├── profiler/
│ │ └── nsys_profile_tools/
│ │ ├── README.md
│ │ ├── gputrc2graph.py
│ │ └── sglang_engine_model.json
│ ├── runtime/
│ │ ├── README.md
│ │ ├── engine/
│ │ │ ├── custom_server.py
│ │ │ ├── embedding.py
│ │ │ ├── fastapi_engine_inference.py
│ │ │ ├── launch_engine.py
│ │ │ ├── offline_batch_inference.py
│ │ │ ├── offline_batch_inference_async.py
│ │ │ ├── offline_batch_inference_eagle.py
│ │ │ ├── offline_batch_inference_qwen_1m.py
│ │ │ ├── offline_batch_inference_vlm.py
│ │ │ ├── readme.md
│ │ │ ├── save_remote_state.py
│ │ │ └── save_sharded_state.py
│ │ ├── hidden_states/
│ │ │ ├── hidden_states_engine.py
│ │ │ └── hidden_states_server.py
│ │ ├── lora.py
│ │ ├── multimodal/
│ │ │ ├── llama3_llava_server.py
│ │ │ ├── llava_onevision_server.py
│ │ │ ├── pixtral_server.py
│ │ │ └── qwen_llava_server.py
│ │ ├── multimodal_embedding.py
│ │ ├── openai_chat_with_response_prefill.py
│ │ ├── qwen3_vl_reranker.py
│ │ ├── reward_model.py
│ │ ├── token_in_token_out/
│ │ │ ├── token_in_token_out_llm_engine.py
│ │ │ ├── token_in_token_out_llm_server.py
│ │ │ ├── token_in_token_out_vlm_engine.py
│ │ │ └── token_in_token_out_vlm_server.py
│ │ └── vertex_predict.py
│ ├── sagemaker/
│ │ └── deploy_and_serve_endpoint.py
│ └── usage/
│ └── modelopt_quantize_and_export.py
├── python/
│ ├── pyproject.toml
│ ├── pyproject_cpu.toml
│ ├── pyproject_npu.toml
│ ├── pyproject_other.toml
│ ├── pyproject_xpu.toml
│ └── sglang/
│ ├── README.md
│ ├── __init__.py
│ ├── _mps_stub.py
│ ├── _triton_stub.py
│ ├── bench_offline_throughput.py
│ ├── bench_one_batch.py
│ ├── bench_one_batch_server.py
│ ├── bench_serving.py
│ ├── benchmark/
│ │ ├── __init__.py
│ │ ├── bench_utils.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── custom.py
│ │ │ ├── generated_shared_prefix.py
│ │ │ ├── image.py
│ │ │ ├── mmmu.py
│ │ │ ├── mooncake.py
│ │ │ ├── openai_dataset.py
│ │ │ ├── random.py
│ │ │ └── sharegpt.py
│ │ └── utils.py
│ ├── check_env.py
│ ├── cli/
│ │ ├── __init__.py
│ │ ├── generate.py
│ │ ├── main.py
│ │ ├── serve.py
│ │ └── utils.py
│ ├── compile_deep_gemm.py
│ ├── eval/
│ │ ├── llama3_eval.py
│ │ └── loogle_eval.py
│ ├── global_config.py
│ ├── jit_kernel/
│ │ ├── .clang-format
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── add_constant.py
│ │ ├── awq_dequantize.py
│ │ ├── awq_marlin_repack.py
│ │ ├── benchmark/
│ │ │ ├── bench_awq_dequantize.py
│ │ │ ├── bench_awq_marlin_moe_repack.py
│ │ │ ├── bench_awq_marlin_repack.py
│ │ │ ├── bench_concat_mla.py
│ │ │ ├── bench_fused_add_rmsnorm.py
│ │ │ ├── bench_fused_norm_scale_shift.py
│ │ │ ├── bench_gptq_marlin.py
│ │ │ ├── bench_gptq_marlin_repack.py
│ │ │ ├── bench_hadamard.py
│ │ │ ├── bench_hicache.py
│ │ │ ├── bench_moe_wna16_marlin.py
│ │ │ ├── bench_norm.py
│ │ │ ├── bench_norm_impls.py
│ │ │ ├── bench_nvfp4_blockwise_moe.py
│ │ │ ├── bench_nvfp4_quant.py
│ │ │ ├── bench_nvfp4_scaled_mm.py
│ │ │ ├── bench_per_tensor_quant_fp8.py
│ │ │ ├── bench_per_token_group_quant_8bit.py
│ │ │ ├── bench_qknorm.py
│ │ │ ├── bench_qknorm_across_heads.py
│ │ │ ├── bench_qwen_image_modulation.py
│ │ │ ├── bench_renorm.py
│ │ │ ├── bench_rmsnorm.py
│ │ │ ├── bench_rope.py
│ │ │ ├── bench_store_cache.py
│ │ │ └── utils.py
│ │ ├── concat_mla.py
│ │ ├── csrc/
│ │ │ ├── add_constant.cuh
│ │ │ ├── diffusion/
│ │ │ │ └── timestep_embedding.cuh
│ │ │ ├── elementwise/
│ │ │ │ ├── concat_mla.cuh
│ │ │ │ ├── fused_add_rmsnorm.cuh
│ │ │ │ ├── fused_metadata_copy.cuh
│ │ │ │ ├── kvcache.cuh
│ │ │ │ ├── pos_enc.cuh
│ │ │ │ ├── qknorm.cuh
│ │ │ │ ├── qknorm_across_heads.cuh
│ │ │ │ ├── rmsnorm.cuh
│ │ │ │ └── rope.cuh
│ │ │ ├── fast-hadamard-transform/
│ │ │ │ ├── code_gen.py
│ │ │ │ ├── fast_hadamard_transform.h
│ │ │ │ ├── fast_hadamard_transform_common.h
│ │ │ │ ├── fast_hadamard_transform_special.h
│ │ │ │ ├── hadamard_jit.cuh
│ │ │ │ └── static_switch.h
│ │ │ ├── gemm/
│ │ │ │ ├── awq_dequantize.cuh
│ │ │ │ ├── marlin/
│ │ │ │ │ ├── awq_marlin_repack.cuh
│ │ │ │ │ ├── dequant.h
│ │ │ │ │ ├── gptq_marlin.cuh
│ │ │ │ │ ├── gptq_marlin_repack.cuh
│ │ │ │ │ ├── kernel.h
│ │ │ │ │ ├── marlin.cuh
│ │ │ │ │ ├── marlin_dtypes.cuh
│ │ │ │ │ └── marlin_template.h
│ │ │ │ ├── marlin_moe/
│ │ │ │ │ ├── kernel.h
│ │ │ │ │ ├── marlin_template.h
│ │ │ │ │ └── moe_wna16_marlin.cuh
│ │ │ │ ├── nvfp4/
│ │ │ │ │ ├── nvfp4_expert_quant.cuh
│ │ │ │ │ ├── nvfp4_quant.cuh
│ │ │ │ │ ├── nvfp4_quant_entry.cuh
│ │ │ │ │ ├── nvfp4_quant_kernels.cuh
│ │ │ │ │ ├── nvfp4_scaled_mm_entry.cuh
│ │ │ │ │ └── nvfp4_scaled_mm_kernels.cuh
│ │ │ │ ├── per_tensor_quant_fp8.cuh
│ │ │ │ └── per_token_group_quant_8bit.cuh
│ │ │ ├── hicache.cuh
│ │ │ ├── lora/
│ │ │ │ └── moe_lora_align_kernel.cu
│ │ │ ├── moe/
│ │ │ │ └── nvfp4_blockwise_moe.cuh
│ │ │ ├── ngram_embedding.cuh
│ │ │ └── nsa/
│ │ │ └── fused_store_index_cache.cuh
│ │ ├── cutedsl_gdn.py
│ │ ├── diffusion/
│ │ │ ├── cutedsl/
│ │ │ │ ├── common/
│ │ │ │ │ ├── norm_fusion.py
│ │ │ │ │ └── reduce.py
│ │ │ │ ├── scale_residual_norm_scale_shift.py
│ │ │ │ └── utils.py
│ │ │ └── triton/
│ │ │ ├── mps_fallback.py
│ │ │ ├── norm.py
│ │ │ ├── npu_fallback.py
│ │ │ ├── rmsnorm_onepass.py
│ │ │ ├── rotary.py
│ │ │ └── scale_shift.py
│ │ ├── flash_attention_v4.py
│ │ ├── fused_metadata_copy.py
│ │ ├── fused_store_index_cache.py
│ │ ├── gptq_marlin.py
│ │ ├── gptq_marlin_repack.py
│ │ ├── hadamard.py
│ │ ├── hicache.py
│ │ ├── include/
│ │ │ └── sgl_kernel/
│ │ │ ├── atomic.cuh
│ │ │ ├── cta.cuh
│ │ │ ├── impl/
│ │ │ │ └── norm.cuh
│ │ │ ├── math.cuh
│ │ │ ├── runtime.cuh
│ │ │ ├── scalar_type.hpp
│ │ │ ├── source_location.h
│ │ │ ├── tensor.h
│ │ │ ├── tile.cuh
│ │ │ ├── type.cuh
│ │ │ ├── utils.cuh
│ │ │ ├── utils.h
│ │ │ ├── vec.cuh
│ │ │ └── warp.cuh
│ │ ├── kvcache.py
│ │ ├── moe_lora_align.py
│ │ ├── moe_wna16_marlin.py
│ │ ├── ngram_embedding.py
│ │ ├── norm.py
│ │ ├── nvfp4.py
│ │ ├── per_tensor_quant_fp8.py
│ │ ├── per_token_group_quant_8bit.py
│ │ ├── rope.py
│ │ ├── tests/
│ │ │ ├── test_add_constant.py
│ │ │ ├── test_awq_dequantize.py
│ │ │ ├── test_awq_marlin_moe_repack.py
│ │ │ ├── test_awq_marlin_repack.py
│ │ │ ├── test_concat_mla.py
│ │ │ ├── test_cutedsl_gdn.py
│ │ │ ├── test_flash_attention_4.py
│ │ │ ├── test_fused_add_rmsnorm.py
│ │ │ ├── test_fused_metadata_copy.py
│ │ │ ├── test_fused_norm_scale_shift.py
│ │ │ ├── test_fused_store_index_cache.py
│ │ │ ├── test_fused_verify_triton_gdn.py
│ │ │ ├── test_gptq_marlin.py
│ │ │ ├── test_gptq_marlin_repack.py
│ │ │ ├── test_hadamard_jit.py
│ │ │ ├── test_moe_lora_align_block_size.py
│ │ │ ├── test_moe_wna16_marlin.py
│ │ │ ├── test_norm_jit.py
│ │ │ ├── test_nvfp4_blockwise_moe.py
│ │ │ ├── test_nvfp4_gemm.py
│ │ │ ├── test_nvfp4_quant.py
│ │ │ ├── test_per_tensor_quant_fp8.py
│ │ │ ├── test_per_token_group_quant_8bit.py
│ │ │ ├── test_pos_enc.py
│ │ │ ├── test_qknorm.py
│ │ │ ├── test_qknorm_across_heads.py
│ │ │ ├── test_qwen_image_modulation.py
│ │ │ ├── test_renorm.py
│ │ │ ├── test_rmsnorm.py
│ │ │ ├── test_rope.py
│ │ │ ├── test_store_cache.py
│ │ │ └── test_timestep_embedding.py
│ │ ├── timestep_embedding.py
│ │ └── utils.py
│ ├── lang/
│ │ ├── api.py
│ │ ├── backend/
│ │ │ ├── anthropic.py
│ │ │ ├── base_backend.py
│ │ │ ├── litellm.py
│ │ │ ├── openai.py
│ │ │ ├── runtime_endpoint.py
│ │ │ └── vertexai.py
│ │ ├── chat_template.py
│ │ ├── choices.py
│ │ ├── interpreter.py
│ │ ├── ir.py
│ │ └── tracer.py
│ ├── launch_server.py
│ ├── multimodal_gen/
│ │ ├── .claude/
│ │ │ ├── CLAUDE.md
│ │ │ └── skills/
│ │ │ ├── diffusion-kernel/
│ │ │ │ ├── SKILL.md
│ │ │ │ ├── add-cuda-kernel.md
│ │ │ │ ├── add-triton-kernel.md
│ │ │ │ ├── diffusion-benchmark-and-profile.md
│ │ │ │ ├── nsight-profiler.md
│ │ │ │ ├── references/
│ │ │ │ │ ├── a100-optimization-guide.md
│ │ │ │ │ ├── h100-optimization-guide.md
│ │ │ │ │ ├── kernel-templates.md
│ │ │ │ │ ├── t4-optimization-guide.md
│ │ │ │ │ └── troubleshooting.md
│ │ │ │ ├── scripts/
│ │ │ │ │ ├── bench_diffusion_denoise.py
│ │ │ │ │ ├── bench_diffusion_rmsnorm.py
│ │ │ │ │ └── diffusion_skill_env.py
│ │ │ │ └── use-efficient-diffusion-kernels.md
│ │ │ ├── diffusion-optimal-perf/
│ │ │ │ └── SKILL.md
│ │ │ └── support-new-model/
│ │ │ └── SKILL.md
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── apps/
│ │ │ ├── ComfyUI_SGLDiffusion/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── core/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generator.py
│ │ │ │ │ ├── model_patcher.py
│ │ │ │ │ └── server_api.py
│ │ │ │ ├── executors/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── flux.py
│ │ │ │ │ ├── qwen_image.py
│ │ │ │ │ └── zimage.py
│ │ │ │ ├── nodes.py
│ │ │ │ ├── test/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── test_flux_pipeline.py
│ │ │ │ │ ├── test_qwen_image_edit_pipeline.py
│ │ │ │ │ ├── test_qwen_image_pipeline.py
│ │ │ │ │ └── test_zimage_pipeline.py
│ │ │ │ ├── utils.py
│ │ │ │ └── workflows/
│ │ │ │ ├── flux_sgld_sp.json
│ │ │ │ ├── qwen_image_sgld.json
│ │ │ │ ├── sgld_image2video.json
│ │ │ │ ├── sgld_text2img.json
│ │ │ │ └── z-image_sgld.json
│ │ │ └── webui/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── main.py
│ │ ├── benchmarks/
│ │ │ ├── bench_offline_throughput.py
│ │ │ ├── bench_serving.py
│ │ │ ├── compare_perf.py
│ │ │ └── datasets.py
│ │ ├── configs/
│ │ │ ├── __init__.py
│ │ │ ├── backend/
│ │ │ │ └── vmoba/
│ │ │ │ ├── wan_1.3B_77_448_832.json
│ │ │ │ └── wan_1.3B_77_480_832.json
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adapter/
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── ltx_2_connector.py
│ │ │ │ ├── base.py
│ │ │ │ ├── bridges/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── mova_dual_tower.py
│ │ │ │ ├── dits/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── flux.py
│ │ │ │ │ ├── glmimage.py
│ │ │ │ │ ├── helios.py
│ │ │ │ │ ├── hunyuan3d.py
│ │ │ │ │ ├── hunyuanvideo.py
│ │ │ │ │ ├── ltx_2.py
│ │ │ │ │ ├── mova_audio.py
│ │ │ │ │ ├── mova_video.py
│ │ │ │ │ ├── qwenimage.py
│ │ │ │ │ ├── sana.py
│ │ │ │ │ ├── wanvideo.py
│ │ │ │ │ └── zimage.py
│ │ │ │ ├── encoders/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── clip.py
│ │ │ │ │ ├── gemma2.py
│ │ │ │ │ ├── gemma_3.py
│ │ │ │ │ ├── llama.py
│ │ │ │ │ ├── qwen3.py
│ │ │ │ │ ├── qwen_image.py
│ │ │ │ │ └── t5.py
│ │ │ │ ├── vaes/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── dac.py
│ │ │ │ │ ├── flux.py
│ │ │ │ │ ├── glmimage.py
│ │ │ │ │ ├── hunyuan3d.py
│ │ │ │ │ ├── hunyuanvae.py
│ │ │ │ │ ├── ltx_audio.py
│ │ │ │ │ ├── ltx_video.py
│ │ │ │ │ ├── qwenimage.py
│ │ │ │ │ ├── sana.py
│ │ │ │ │ └── wanvae.py
│ │ │ │ └── vocoder/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ └── ltx_vocoder.py
│ │ │ ├── pipeline_configs/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── diffusers_generic.py
│ │ │ │ ├── flux.py
│ │ │ │ ├── flux_finetuned.py
│ │ │ │ ├── glm_image.py
│ │ │ │ ├── helios.py
│ │ │ │ ├── hunyuan.py
│ │ │ │ ├── hunyuan3d.py
│ │ │ │ ├── ltx_2.py
│ │ │ │ ├── mova.py
│ │ │ │ ├── qwen_image.py
│ │ │ │ ├── sana.py
│ │ │ │ ├── wan.py
│ │ │ │ └── zimage.py
│ │ │ ├── quantization.py
│ │ │ ├── sample/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── diffusers_generic.py
│ │ │ │ ├── flux.py
│ │ │ │ ├── glmimage.py
│ │ │ │ ├── helios.py
│ │ │ │ ├── hunyuan.py
│ │ │ │ ├── hunyuan3d.py
│ │ │ │ ├── ltx_2.py
│ │ │ │ ├── mova.py
│ │ │ │ ├── qwenimage.py
│ │ │ │ ├── sampling_params.py
│ │ │ │ ├── sana.py
│ │ │ │ ├── teacache.py
│ │ │ │ ├── wan.py
│ │ │ │ └── zimage.py
│ │ │ └── utils.py
│ │ ├── csrc/
│ │ │ ├── attn/
│ │ │ │ └── vmoba_attn/
│ │ │ │ ├── README.md
│ │ │ │ ├── setup.py
│ │ │ │ ├── tests/
│ │ │ │ │ └── test_vmoba_attn.py
│ │ │ │ └── vmoba/
│ │ │ │ ├── __init__.py
│ │ │ │ └── vmoba.py
│ │ │ └── render/
│ │ │ ├── hunyuan3d_rasterizer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── rasterizer.cpp
│ │ │ │ ├── rasterizer.h
│ │ │ │ └── rasterizer_gpu.cu
│ │ │ └── mesh_processor/
│ │ │ ├── __init__.py
│ │ │ └── mesh_processor.cpp
│ │ ├── docs/
│ │ │ └── quantization.md
│ │ ├── envs.py
│ │ ├── registry.py
│ │ ├── runtime/
│ │ │ ├── cache/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cache_dit_integration.py
│ │ │ │ └── teacache.py
│ │ │ ├── distributed/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── communication_op.py
│ │ │ │ ├── device_communicators/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base_device_communicator.py
│ │ │ │ │ ├── cpu_communicator.py
│ │ │ │ │ ├── cuda_communicator.py
│ │ │ │ │ ├── pynccl.py
│ │ │ │ │ └── pynccl_wrapper.py
│ │ │ │ ├── group_coordinator.py
│ │ │ │ ├── parallel_groups.py
│ │ │ │ ├── parallel_state.py
│ │ │ │ └── utils.py
│ │ │ ├── entrypoints/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cli/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cli_types.py
│ │ │ │ │ ├── generate.py
│ │ │ │ │ ├── main.py
│ │ │ │ │ ├── serve.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── diffusion_generator.py
│ │ │ │ ├── http_server.py
│ │ │ │ ├── openai/
│ │ │ │ │ ├── common_api.py
│ │ │ │ │ ├── image_api.py
│ │ │ │ │ ├── mesh_api.py
│ │ │ │ │ ├── protocol.py
│ │ │ │ │ ├── storage.py
│ │ │ │ │ ├── stores.py
│ │ │ │ │ ├── utils.py
│ │ │ │ │ └── video_api.py
│ │ │ │ ├── post_training/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── io_struct.py
│ │ │ │ │ └── weights_api.py
│ │ │ │ └── utils.py
│ │ │ ├── launch_server.py
│ │ │ ├── layers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── activation.py
│ │ │ │ ├── attention/
│ │ │ │ │ ├── STA_configuration.py
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── backends/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── aiter.py
│ │ │ │ │ │ ├── aiter_sage.py
│ │ │ │ │ │ ├── attention_backend.py
│ │ │ │ │ │ ├── flash_attn.py
│ │ │ │ │ │ ├── flash_attn_2.py
│ │ │ │ │ │ ├── sage_attn.py
│ │ │ │ │ │ ├── sage_attn3.py
│ │ │ │ │ │ ├── sdpa.py
│ │ │ │ │ │ ├── sliding_tile_attn.py
│ │ │ │ │ │ ├── sparse_linear_attn.py
│ │ │ │ │ │ ├── sparse_video_gen_2_attn.py
│ │ │ │ │ │ ├── video_sparse_attn.py
│ │ │ │ │ │ └── vmoba.py
│ │ │ │ │ ├── layer.py
│ │ │ │ │ ├── selector.py
│ │ │ │ │ └── turbo_layer.py
│ │ │ │ ├── custom_op.py
│ │ │ │ ├── elementwise.py
│ │ │ │ ├── layernorm.py
│ │ │ │ ├── linear.py
│ │ │ │ ├── lora/
│ │ │ │ │ └── linear.py
│ │ │ │ ├── mlp.py
│ │ │ │ ├── quantization/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── configs/
│ │ │ │ │ │ ├── base_config.py
│ │ │ │ │ │ └── nunchaku_config.py
│ │ │ │ │ ├── fp8.py
│ │ │ │ │ ├── modelslim.py
│ │ │ │ │ └── nunchaku_linear.py
│ │ │ │ ├── rotary_embedding/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── mrope.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── usp.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── visual_embedding.py
│ │ │ │ └── vocab_parallel_embedding.py
│ │ │ ├── loader/
│ │ │ │ ├── component_loaders/
│ │ │ │ │ ├── adapter_loader.py
│ │ │ │ │ ├── bridge_loader.py
│ │ │ │ │ ├── component_loader.py
│ │ │ │ │ ├── image_encoder_loader.py
│ │ │ │ │ ├── scheduler_loader.py
│ │ │ │ │ ├── text_encoder_loader.py
│ │ │ │ │ ├── transformer_loader.py
│ │ │ │ │ ├── vae_loader.py
│ │ │ │ │ ├── vl_encoder_loader.py
│ │ │ │ │ └── vocoder_loader.py
│ │ │ │ ├── fsdp_load.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── weight_utils.py
│ │ │ │ └── weights_updater.py
│ │ │ ├── managers/
│ │ │ │ ├── forward_context.py
│ │ │ │ ├── gpu_worker.py
│ │ │ │ └── scheduler.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adapter/
│ │ │ │ │ └── ltx_2_connector.py
│ │ │ │ ├── bridges/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── mova_dual_tower.py
│ │ │ │ ├── dits/
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── causal_wanvideo.py
│ │ │ │ │ ├── flux.py
│ │ │ │ │ ├── flux_2.py
│ │ │ │ │ ├── glm_image.py
│ │ │ │ │ ├── helios.py
│ │ │ │ │ ├── hunyuan3d.py
│ │ │ │ │ ├── hunyuanvideo.py
│ │ │ │ │ ├── ltx_2.py
│ │ │ │ │ ├── mova_audio_dit.py
│ │ │ │ │ ├── mova_video_dit.py
│ │ │ │ │ ├── qwen_image.py
│ │ │ │ │ ├── sana.py
│ │ │ │ │ ├── wanvideo.py
│ │ │ │ │ └── zimage.py
│ │ │ │ ├── encoders/
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── bert.py
│ │ │ │ │ ├── clip.py
│ │ │ │ │ ├── gemma2.py
│ │ │ │ │ ├── gemma_3.py
│ │ │ │ │ ├── hunyuan3d.py
│ │ │ │ │ ├── llama.py
│ │ │ │ │ ├── mistral_3.py
│ │ │ │ │ ├── qwen2_5vl.py
│ │ │ │ │ ├── qwen3.py
│ │ │ │ │ ├── t5.py
│ │ │ │ │ └── vision.py
│ │ │ │ ├── parameter.py
│ │ │ │ ├── registry.py
│ │ │ │ ├── schedulers/
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── flow_match_pair.py
│ │ │ │ │ ├── hunyuan3d_scheduler.py
│ │ │ │ │ ├── scheduling_comfyui_passthrough.py
│ │ │ │ │ ├── scheduling_dpm_solver_multistep.py
│ │ │ │ │ ├── scheduling_flow_match_euler_discrete.py
│ │ │ │ │ ├── scheduling_flow_unipc_multistep.py
│ │ │ │ │ ├── scheduling_helios.py
│ │ │ │ │ ├── scheduling_self_forcing_flow_match.py
│ │ │ │ │ └── scheduling_unipc_multistep.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── vaes/
│ │ │ │ │ ├── autoencoder.py
│ │ │ │ │ ├── autoencoder_dc.py
│ │ │ │ │ ├── autoencoder_kl_flux2.py
│ │ │ │ │ ├── autoencoder_kl_qwenimage.py
│ │ │ │ │ ├── common.py
│ │ │ │ │ ├── dac.py
│ │ │ │ │ ├── hunyuan3d_vae.py
│ │ │ │ │ ├── hunyuanvae.py
│ │ │ │ │ ├── ltx_2_audio.py
│ │ │ │ │ ├── ltx_2_vae.py
│ │ │ │ │ ├── parallel/
│ │ │ │ │ │ ├── wan_common_utils.py
│ │ │ │ │ │ └── wan_dist_utils.py
│ │ │ │ │ └── wanvae.py
│ │ │ │ ├── vision_utils.py
│ │ │ │ └── vocoder/
│ │ │ │ └── ltx_2_vocoder.py
│ │ │ ├── pipelines/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── comfyui_flux_pipeline.py
│ │ │ │ ├── comfyui_qwen_image_pipeline.py
│ │ │ │ ├── comfyui_zimage_pipeline.py
│ │ │ │ ├── diffusers_pipeline.py
│ │ │ │ ├── flux.py
│ │ │ │ ├── flux_2.py
│ │ │ │ ├── flux_2_klein.py
│ │ │ │ ├── glm_image.py
│ │ │ │ ├── helios_pipeline.py
│ │ │ │ ├── hunyuan3d_pipeline.py
│ │ │ │ ├── hunyuan_pipeline.py
│ │ │ │ ├── ltx_2_pipeline.py
│ │ │ │ ├── mova_pipeline.py
│ │ │ │ ├── qwen_image.py
│ │ │ │ ├── sana.py
│ │ │ │ ├── wan_causal_dmd_pipeline.py
│ │ │ │ ├── wan_dmd_pipeline.py
│ │ │ │ ├── wan_i2v_dmd_pipeline.py
│ │ │ │ ├── wan_i2v_pipeline.py
│ │ │ │ ├── wan_pipeline.py
│ │ │ │ └── zimage_pipeline.py
│ │ │ ├── pipelines_core/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── composed_pipeline_base.py
│ │ │ │ ├── executors/
│ │ │ │ │ ├── parallel_executor.py
│ │ │ │ │ ├── pipeline_executor.py
│ │ │ │ │ └── sync_executor.py
│ │ │ │ ├── lora_format_adapter.py
│ │ │ │ ├── lora_pipeline.py
│ │ │ │ ├── schedule_batch.py
│ │ │ │ └── stages/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── causal_denoising.py
│ │ │ │ ├── comfyui_latent_preparation.py
│ │ │ │ ├── decoding.py
│ │ │ │ ├── decoding_av.py
│ │ │ │ ├── denoising.py
│ │ │ │ ├── denoising_av.py
│ │ │ │ ├── denoising_dmd.py
│ │ │ │ ├── encoding.py
│ │ │ │ ├── hunyuan3d_paint.py
│ │ │ │ ├── hunyuan3d_shape.py
│ │ │ │ ├── image_encoding.py
│ │ │ │ ├── input_validation.py
│ │ │ │ ├── latent_preparation.py
│ │ │ │ ├── latent_preparation_av.py
│ │ │ │ ├── model_specific_stages/
│ │ │ │ │ ├── glm_image.py
│ │ │ │ │ ├── helios_decoding.py
│ │ │ │ │ ├── helios_denoising.py
│ │ │ │ │ ├── mova.py
│ │ │ │ │ └── qwen_image_layered.py
│ │ │ │ ├── text_connector.py
│ │ │ │ ├── text_encoding.py
│ │ │ │ ├── timestep_preparation.py
│ │ │ │ └── validators.py
│ │ │ ├── platforms/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cpu.py
│ │ │ │ ├── cuda.py
│ │ │ │ ├── interface.py
│ │ │ │ ├── mps.py
│ │ │ │ ├── musa.py
│ │ │ │ ├── npu.py
│ │ │ │ └── rocm.py
│ │ │ ├── postprocess/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── realesrgan_upscaler.py
│ │ │ │ └── rife_interpolator.py
│ │ │ ├── scheduler_client.py
│ │ │ ├── server_args.py
│ │ │ └── utils/
│ │ │ ├── common.py
│ │ │ ├── distributed.py
│ │ │ ├── hf_diffusers_utils.py
│ │ │ ├── layerwise_offload.py
│ │ │ ├── logging_utils.py
│ │ │ ├── mesh3d_utils.py
│ │ │ ├── perf_logger.py
│ │ │ ├── profiler.py
│ │ │ └── quantization_utils.py
│ │ ├── test/
│ │ │ ├── __init__.py
│ │ │ ├── cli/
│ │ │ │ ├── test_generate_common.py
│ │ │ │ ├── test_generate_i2i.py
│ │ │ │ └── test_generate_t2i_perf.py
│ │ │ ├── run_suite.py
│ │ │ ├── scripts/
│ │ │ │ ├── gen_diffusion_ci_outputs.py
│ │ │ │ └── gen_perf_baselines.py
│ │ │ ├── server/
│ │ │ │ ├── ascend/
│ │ │ │ │ ├── perf_baselines_npu.json
│ │ │ │ │ ├── test_server_1_npu.py
│ │ │ │ │ ├── test_server_2_npu.py
│ │ │ │ │ ├── test_server_8_npu.py
│ │ │ │ │ └── testcase_configs_npu.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── perf_baselines.json
│ │ │ │ ├── test_server_2_gpu_a.py
│ │ │ │ ├── test_server_2_gpu_b.py
│ │ │ │ ├── test_server_a.py
│ │ │ │ ├── test_server_b.py
│ │ │ │ ├── test_server_common.py
│ │ │ │ ├── test_server_utils.py
│ │ │ │ ├── test_update_weights_from_disk.py
│ │ │ │ └── testcase_configs.py
│ │ │ ├── slack_utils.py
│ │ │ ├── test_files/
│ │ │ │ ├── launch_flux.json
│ │ │ │ └── launch_wan.json
│ │ │ ├── test_utils.py
│ │ │ └── unit/
│ │ │ ├── test_lora_format_adapter.py
│ │ │ ├── test_sampling_params.py
│ │ │ ├── test_server_args.py
│ │ │ └── test_storage.py
│ │ ├── third_party/
│ │ │ ├── __init__.py
│ │ │ └── pynvml.py
│ │ ├── tools/
│ │ │ ├── convert_hf_to_fp8.py
│ │ │ └── wan_repack.py
│ │ └── utils.py
│ ├── profiler.py
│ ├── srt/
│ │ ├── batch_invariant_ops/
│ │ │ ├── __init__.py
│ │ │ └── batch_invariant_ops.py
│ │ ├── batch_overlap/
│ │ │ ├── operations.py
│ │ │ ├── operations_strategy.py
│ │ │ ├── single_batch_overlap.py
│ │ │ └── two_batch_overlap.py
│ │ ├── checkpoint_engine/
│ │ │ ├── __init__.py
│ │ │ ├── checkpoint_engine_worker.py
│ │ │ └── update.py
│ │ ├── compilation/
│ │ │ ├── backend.py
│ │ │ ├── compilation_config.py
│ │ │ ├── compilation_counter.py
│ │ │ ├── compile.py
│ │ │ ├── compiler_interface.py
│ │ │ ├── cuda_piecewise_backend.py
│ │ │ ├── fix_functionalization.py
│ │ │ ├── fx_utils.py
│ │ │ ├── inductor_pass.py
│ │ │ ├── npu_piecewise_backend.py
│ │ │ ├── pass_manager.py
│ │ │ ├── piecewise_context_manager.py
│ │ │ └── weak_ref_tensor.py
│ │ ├── configs/
│ │ │ ├── __init__.py
│ │ │ ├── afmoe.py
│ │ │ ├── bailing_hybrid.py
│ │ │ ├── chatglm.py
│ │ │ ├── dbrx.py
│ │ │ ├── deepseek_ocr.py
│ │ │ ├── deepseekvl2.py
│ │ │ ├── device_config.py
│ │ │ ├── dots_ocr.py
│ │ │ ├── dots_vlm.py
│ │ │ ├── exaone.py
│ │ │ ├── falcon_h1.py
│ │ │ ├── granitemoehybrid.py
│ │ │ ├── internvl.py
│ │ │ ├── janus_pro.py
│ │ │ ├── jet_nemotron.py
│ │ │ ├── jet_vlm.py
│ │ │ ├── kimi_k25.py
│ │ │ ├── kimi_linear.py
│ │ │ ├── kimi_vl.py
│ │ │ ├── kimi_vl_moonvit.py
│ │ │ ├── lfm2.py
│ │ │ ├── lfm2_moe.py
│ │ │ ├── load_config.py
│ │ │ ├── longcat_flash.py
│ │ │ ├── mamba_utils.py
│ │ │ ├── model_config.py
│ │ │ ├── modelopt_config.py
│ │ │ ├── nano_nemotron_vl.py
│ │ │ ├── nemotron_h.py
│ │ │ ├── olmo3.py
│ │ │ ├── points_v15_chat.py
│ │ │ ├── qwen3_5.py
│ │ │ ├── qwen3_next.py
│ │ │ ├── qwen3_omni.py
│ │ │ ├── qwen3_vl.py
│ │ │ ├── radio.py
│ │ │ ├── step3_vl.py
│ │ │ ├── step3p5.py
│ │ │ ├── update_config.py
│ │ │ └── utils.py
│ │ ├── connector/
│ │ │ ├── __init__.py
│ │ │ ├── base_connector.py
│ │ │ ├── redis.py
│ │ │ ├── remote_instance.py
│ │ │ ├── s3.py
│ │ │ ├── serde/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── safe_serde.py
│ │ │ │ └── serde.py
│ │ │ └── utils.py
│ │ ├── constants.py
│ │ ├── constrained/
│ │ │ ├── base_grammar_backend.py
│ │ │ ├── grammar_manager.py
│ │ │ ├── llguidance_backend.py
│ │ │ ├── outlines_backend.py
│ │ │ ├── outlines_jump_forward.py
│ │ │ ├── reasoner_grammar_backend.py
│ │ │ ├── triton_ops/
│ │ │ │ └── bitmask_ops.py
│ │ │ ├── utils.py
│ │ │ └── xgrammar_backend.py
│ │ ├── debug_utils/
│ │ │ ├── __init__.py
│ │ │ ├── comparator/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── aligner/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── axis_aligner.py
│ │ │ │ │ ├── entrypoint/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── executor.py
│ │ │ │ │ │ ├── planner.py
│ │ │ │ │ │ ├── traced_types.py
│ │ │ │ │ │ └── types.py
│ │ │ │ │ ├── reorderer/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── executor.py
│ │ │ │ │ │ ├── planner.py
│ │ │ │ │ │ └── types.py
│ │ │ │ │ ├── token_aligner/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── concat_steps/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── executor.py
│ │ │ │ │ │ │ └── thd_seq_lens_loader.py
│ │ │ │ │ │ ├── entrypoint.py
│ │ │ │ │ │ └── smart/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── aux_loader.py
│ │ │ │ │ │ ├── aux_plugins.py
│ │ │ │ │ │ ├── executor.py
│ │ │ │ │ │ ├── planner.py
│ │ │ │ │ │ ├── seq_info_builder.py
│ │ │ │ │ │ └── types.py
│ │ │ │ │ └── unsharder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── executor.py
│ │ │ │ │ ├── parallel_info.py
│ │ │ │ │ ├── planner.py
│ │ │ │ │ └── types.py
│ │ │ │ ├── bundle_comparator.py
│ │ │ │ ├── bundle_matcher.py
│ │ │ │ ├── dims_spec/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── comment_parser.py
│ │ │ │ │ ├── dim_parser.py
│ │ │ │ │ ├── dims_parser.py
│ │ │ │ │ ├── modifier_parser.py
│ │ │ │ │ ├── tensor_naming.py
│ │ │ │ │ └── types.py
│ │ │ │ ├── display.py
│ │ │ │ ├── dp_utils.py
│ │ │ │ ├── entrypoint.py
│ │ │ │ ├── log_sink.py
│ │ │ │ ├── meta_overrider.py
│ │ │ │ ├── output_formatter.py
│ │ │ │ ├── output_types.py
│ │ │ │ ├── per_token_visualizer.py
│ │ │ │ ├── preset.py
│ │ │ │ ├── report_sink.py
│ │ │ │ ├── tensor_comparator/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── comparator.py
│ │ │ │ │ ├── formatter.py
│ │ │ │ │ └── types.py
│ │ │ │ ├── utils.py
│ │ │ │ └── visualizer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── figure.py
│ │ │ │ ├── panels.py
│ │ │ │ └── preprocessing.py
│ │ │ ├── cuda_coredump.py
│ │ │ ├── dump_comparator.py
│ │ │ ├── dump_loader.py
│ │ │ ├── dumper.py
│ │ │ ├── log_parser.py
│ │ │ ├── model_truncator.py
│ │ │ ├── schedule_simulator/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── data_source/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── data_loader.py
│ │ │ │ │ └── data_synthesis.py
│ │ │ │ ├── entrypoint.py
│ │ │ │ ├── gpu_state.py
│ │ │ │ ├── metrics.py
│ │ │ │ ├── request.py
│ │ │ │ ├── routers/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── random_router.py
│ │ │ │ │ ├── round_robin_router.py
│ │ │ │ │ └── sticky_router.py
│ │ │ │ ├── schedulers/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── fifo_scheduler.py
│ │ │ │ └── simulator.py
│ │ │ ├── source_patcher/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── code_patcher.py
│ │ │ │ ├── source_editor.py
│ │ │ │ └── types.py
│ │ │ ├── tensor_dump_forward_hook.py
│ │ │ └── text_comparator.py
│ │ ├── disaggregation/
│ │ │ ├── ascend/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conn.py
│ │ │ │ └── transfer_engine.py
│ │ │ ├── base/
│ │ │ │ ├── __init__.py
│ │ │ │ └── conn.py
│ │ │ ├── common/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conn.py
│ │ │ │ └── utils.py
│ │ │ ├── decode.py
│ │ │ ├── decode_kvcache_offload_manager.py
│ │ │ ├── decode_schedule_batch_mixin.py
│ │ │ ├── encode_grpc_server.py
│ │ │ ├── encode_receiver.py
│ │ │ ├── encode_server.py
│ │ │ ├── fake/
│ │ │ │ ├── __init__.py
│ │ │ │ └── conn.py
│ │ │ ├── kv_events.py
│ │ │ ├── mooncake/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conn.py
│ │ │ │ └── utils.py
│ │ │ ├── mori/
│ │ │ │ ├── __init__.py
│ │ │ │ └── conn.py
│ │ │ ├── nixl/
│ │ │ │ ├── __init__.py
│ │ │ │ └── conn.py
│ │ │ ├── prefill.py
│ │ │ └── utils.py
│ │ ├── distributed/
│ │ │ ├── __init__.py
│ │ │ ├── communication_op.py
│ │ │ ├── device_communicators/
│ │ │ │ ├── all_reduce_utils.py
│ │ │ │ ├── cuda_wrapper.py
│ │ │ │ ├── custom_all_reduce.py
│ │ │ │ ├── custom_all_reduce_ops.py
│ │ │ │ ├── custom_all_reduce_utils.py
│ │ │ │ ├── hpu_communicator.py
│ │ │ │ ├── mooncake_transfer_engine.py
│ │ │ │ ├── npu_communicator.py
│ │ │ │ ├── pymscclpp.py
│ │ │ │ ├── pynccl.py
│ │ │ │ ├── pynccl_allocator.py
│ │ │ │ ├── pynccl_wrapper.py
│ │ │ │ ├── quick_all_reduce.py
│ │ │ │ ├── shm_broadcast.py
│ │ │ │ ├── torch_symm_mem.py
│ │ │ │ └── xpu_communicator.py
│ │ │ ├── naive_distributed.py
│ │ │ ├── parallel_state.py
│ │ │ └── utils.py
│ │ ├── dllm/
│ │ │ ├── algorithm/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── joint_threshold.py
│ │ │ │ └── low_confidence.py
│ │ │ ├── config.py
│ │ │ └── mixin/
│ │ │ ├── req.py
│ │ │ └── scheduler.py
│ │ ├── elastic_ep/
│ │ │ ├── elastic_ep.py
│ │ │ ├── expert_backup_client.py
│ │ │ └── expert_backup_manager.py
│ │ ├── entrypoints/
│ │ │ ├── EngineBase.py
│ │ │ ├── anthropic/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── protocol.py
│ │ │ │ └── serving.py
│ │ │ ├── context.py
│ │ │ ├── engine.py
│ │ │ ├── grpc_server.py
│ │ │ ├── harmony_utils.py
│ │ │ ├── http_server.py
│ │ │ ├── http_server_engine.py
│ │ │ ├── ollama/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── protocol.py
│ │ │ │ ├── serving.py
│ │ │ │ └── smart_router.py
│ │ │ ├── openai/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── encoding_dsv32.py
│ │ │ │ ├── protocol.py
│ │ │ │ ├── serving_base.py
│ │ │ │ ├── serving_chat.py
│ │ │ │ ├── serving_classify.py
│ │ │ │ ├── serving_completions.py
│ │ │ │ ├── serving_embedding.py
│ │ │ │ ├── serving_rerank.py
│ │ │ │ ├── serving_responses.py
│ │ │ │ ├── serving_score.py
│ │ │ │ ├── serving_tokenize.py
│ │ │ │ ├── serving_transcription.py
│ │ │ │ ├── tool_server.py
│ │ │ │ ├── usage_processor.py
│ │ │ │ └── utils.py
│ │ │ ├── ssl_utils.py
│ │ │ ├── tool.py
│ │ │ ├── v1_loads.py
│ │ │ └── warmup.py
│ │ ├── environ.py
│ │ ├── eplb/
│ │ │ ├── __init__.py
│ │ │ ├── eplb_algorithms/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── deepseek.py
│ │ │ │ ├── deepseek_vec.py
│ │ │ │ └── elasticity_aware.py
│ │ │ ├── eplb_manager.py
│ │ │ ├── eplb_simulator/
│ │ │ │ ├── __init__.py
│ │ │ │ └── reader.py
│ │ │ ├── expert_distribution.py
│ │ │ ├── expert_location.py
│ │ │ ├── expert_location_dispatch.py
│ │ │ └── expert_location_updater.py
│ │ ├── function_call/
│ │ │ ├── base_format_detector.py
│ │ │ ├── core_types.py
│ │ │ ├── deepseekv31_detector.py
│ │ │ ├── deepseekv32_detector.py
│ │ │ ├── deepseekv3_detector.py
│ │ │ ├── function_call_parser.py
│ │ │ ├── gigachat3_detector.py
│ │ │ ├── glm47_moe_detector.py
│ │ │ ├── glm4_moe_detector.py
│ │ │ ├── gpt_oss_detector.py
│ │ │ ├── hermes_detector.py
│ │ │ ├── internlm_detector.py
│ │ │ ├── json_array_parser.py
│ │ │ ├── kimik2_detector.py
│ │ │ ├── lfm2_detector.py
│ │ │ ├── llama32_detector.py
│ │ │ ├── mimo_detector.py
│ │ │ ├── minimax_m2.py
│ │ │ ├── mistral_detector.py
│ │ │ ├── pythonic_detector.py
│ │ │ ├── qwen25_detector.py
│ │ │ ├── qwen3_coder_detector.py
│ │ │ ├── step3_detector.py
│ │ │ ├── trinity_detector.py
│ │ │ └── utils.py
│ │ ├── grpc/
│ │ │ └── __init__.py
│ │ ├── hardware_backend/
│ │ │ └── npu/
│ │ │ ├── allocator_npu.py
│ │ │ ├── attention/
│ │ │ │ ├── ascend_backend.py
│ │ │ │ ├── ascend_torch_native_backend.py
│ │ │ │ └── mla_preprocess.py
│ │ │ ├── cmo.py
│ │ │ ├── graph_runner/
│ │ │ │ ├── eagle_draft_extend_npu_graph_runner.py
│ │ │ │ ├── eagle_draft_npu_graph_runner.py
│ │ │ │ ├── npu_graph_runner.py
│ │ │ │ └── vit_npu_graph_runner.py
│ │ │ ├── memory_pool_npu.py
│ │ │ ├── modules/
│ │ │ │ ├── deepseek_v2_attention_mla_npu.py
│ │ │ │ └── qwen_vl_processor.py
│ │ │ ├── moe/
│ │ │ │ └── topk.py
│ │ │ ├── quantization/
│ │ │ │ ├── fused_moe_method_npu.py
│ │ │ │ └── linear_method_npu.py
│ │ │ └── utils.py
│ │ ├── layers/
│ │ │ ├── activation.py
│ │ │ ├── amx_utils.py
│ │ │ ├── attention/
│ │ │ │ ├── aiter_backend.py
│ │ │ │ ├── attention_registry.py
│ │ │ │ ├── base_attn_backend.py
│ │ │ │ ├── cutlass_mla_backend.py
│ │ │ │ ├── double_sparsity_backend.py
│ │ │ │ ├── dual_chunk_flashattention_backend.py
│ │ │ │ ├── fla/
│ │ │ │ │ ├── chunk.py
│ │ │ │ │ ├── chunk_delta_h.py
│ │ │ │ │ ├── chunk_o.py
│ │ │ │ │ ├── chunk_scaled_dot_kkt.py
│ │ │ │ │ ├── cumsum.py
│ │ │ │ │ ├── fused_gdn_gating.py
│ │ │ │ │ ├── fused_norm_gate.py
│ │ │ │ │ ├── fused_recurrent.py
│ │ │ │ │ ├── fused_sigmoid_gating_recurrent.py
│ │ │ │ │ ├── index.py
│ │ │ │ │ ├── kda.py
│ │ │ │ │ ├── l2norm.py
│ │ │ │ │ ├── layernorm_gated.py
│ │ │ │ │ ├── op.py
│ │ │ │ │ ├── solve_tril.py
│ │ │ │ │ ├── utils.py
│ │ │ │ │ └── wy_fast.py
│ │ │ │ ├── flashattention_backend.py
│ │ │ │ ├── flashinfer_backend.py
│ │ │ │ ├── flashinfer_mla_backend.py
│ │ │ │ ├── flashmla_backend.py
│ │ │ │ ├── hybrid_attn_backend.py
│ │ │ │ ├── hybrid_linear_attn_backend.py
│ │ │ │ ├── intel_amx_backend.py
│ │ │ │ ├── linear/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── gdn_backend.py
│ │ │ │ │ ├── kda_backend.py
│ │ │ │ │ ├── kernels/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── gdn_cutedsl.py
│ │ │ │ │ │ ├── gdn_flashinfer.py
│ │ │ │ │ │ ├── gdn_triton.py
│ │ │ │ │ │ ├── kda_triton.py
│ │ │ │ │ │ └── kernel_backend.py
│ │ │ │ │ ├── lightning_attn.py
│ │ │ │ │ ├── lightning_backend.py
│ │ │ │ │ ├── linear_metadata.py
│ │ │ │ │ ├── seg_la.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── mamba/
│ │ │ │ │ ├── causal_conv1d.py
│ │ │ │ │ ├── causal_conv1d_triton.py
│ │ │ │ │ ├── mamba.py
│ │ │ │ │ ├── mamba2_metadata.py
│ │ │ │ │ ├── mamba_state_scatter_triton.py
│ │ │ │ │ ├── mixer2_rms_norm_gated.py
│ │ │ │ │ └── ops/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── layernorm_gated.py
│ │ │ │ │ ├── mamba_ssm.py
│ │ │ │ │ ├── ssd_bmm.py
│ │ │ │ │ ├── ssd_chunk_scan.py
│ │ │ │ │ ├── ssd_chunk_state.py
│ │ │ │ │ ├── ssd_combined.py
│ │ │ │ │ ├── ssd_state_passing.py
│ │ │ │ │ └── ssu_dispatch.py
│ │ │ │ ├── merge_state.py
│ │ │ │ ├── nsa/
│ │ │ │ │ ├── dequant_k_cache.py
│ │ │ │ │ ├── index_buf_accessor.py
│ │ │ │ │ ├── nsa_backend_mtp_precompute.py
│ │ │ │ │ ├── nsa_indexer.py
│ │ │ │ │ ├── nsa_mtp_verification.py
│ │ │ │ │ ├── quant_k_cache.py
│ │ │ │ │ ├── tilelang_kernel.py
│ │ │ │ │ ├── transform_index.py
│ │ │ │ │ ├── triton_kernel.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── nsa_backend.py
│ │ │ │ ├── tbo_backend.py
│ │ │ │ ├── torch_flex_backend.py
│ │ │ │ ├── torch_native_backend.py
│ │ │ │ ├── triton_backend.py
│ │ │ │ ├── triton_ops/
│ │ │ │ │ ├── decode_attention.py
│ │ │ │ │ ├── double_sparsity_attention.py
│ │ │ │ │ ├── extend_attention.py
│ │ │ │ │ ├── merge_state.py
│ │ │ │ │ ├── prefill_attention.py
│ │ │ │ │ ├── rocm_mla_decode_rope.py
│ │ │ │ │ └── trtllm_fp8_kv_kernel.py
│ │ │ │ ├── trtllm_mha_backend.py
│ │ │ │ ├── trtllm_mla_backend.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── vision.py
│ │ │ │ ├── vision_utils.py
│ │ │ │ ├── wave_backend.py
│ │ │ │ ├── wave_ops/
│ │ │ │ │ ├── decode_attention.py
│ │ │ │ │ ├── extend_attention.py
│ │ │ │ │ └── prefill_attention.py
│ │ │ │ └── xpu_backend.py
│ │ │ ├── communicator.py
│ │ │ ├── communicator_nsa_cp.py
│ │ │ ├── conv.py
│ │ │ ├── deep_gemm_wrapper/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── compile_utils.py
│ │ │ │ ├── configurer.py
│ │ │ │ └── entrypoint.py
│ │ │ ├── dp_attention.py
│ │ │ ├── elementwise.py
│ │ │ ├── flashinfer_comm_fusion.py
│ │ │ ├── int4fp8_utils.py
│ │ │ ├── layernorm.py
│ │ │ ├── linear.py
│ │ │ ├── logits_processor.py
│ │ │ ├── model_parallel.py
│ │ │ ├── modelopt_utils.py
│ │ │ ├── moe/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cutlass_moe.py
│ │ │ │ ├── cutlass_moe_params.py
│ │ │ │ ├── cutlass_w4a8_moe.py
│ │ │ │ ├── ep_moe/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── kernels.py
│ │ │ │ │ └── layer.py
│ │ │ │ ├── flashinfer_cutedsl_moe.py
│ │ │ │ ├── flashinfer_trtllm_moe.py
│ │ │ │ ├── fused_moe_native.py
│ │ │ │ ├── fused_moe_triton/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── configs/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── triton_3_1_0/
│ │ │ │ │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=1024,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=AMD_Radeon_Graphics.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=AMD_Radeon_Graphics.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=AMD_Radeon_Graphics.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=AMD_Radeon_Graphics.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ └── E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ ├── triton_3_2_0/
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=128,N=96,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ └── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── triton_3_3_0/
│ │ │ │ │ │ │ └── E=16,N=1024,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── triton_3_3_1/
│ │ │ │ │ │ │ ├── E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=320,device_name=NVIDIA_H20-3e.json
│ │ │ │ │ │ │ ├── E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ └── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── triton_3_4_0/
│ │ │ │ │ │ │ ├── E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=1856,device_name=NVIDIA_L40S.json
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=928,device_name=NVIDIA_L40S.json
│ │ │ │ │ │ │ ├── E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ │ ├── E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200.json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=512,device_name=NVIDIA_B200.json
│ │ │ │ │ │ │ ├── E=256,N=512,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ │ │ ├── E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=384,N=128,device_name=,dtype=int4_w4a16.json
│ │ │ │ │ │ │ ├── E=384,N=128,device_name=,dtype=int4_w4a16_down.json
│ │ │ │ │ │ │ ├── E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H20-3e.json
│ │ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_B200.json
│ │ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_H20-3e.json
│ │ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ └── E=512,N=64,device_name=NVIDIA_H200.json
│ │ │ │ │ │ └── triton_3_5_1/
│ │ │ │ │ │ ├── E=128,N=1344,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=1856,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=232,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=232,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=2688,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=464,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=928,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=16,N=1856,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=16,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=16,N=2048,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H20,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H200.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition.json
│ │ │ │ │ │ ├── E=161,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=20,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=20,N=1536,device_name=NVIDIA_H200.json
│ │ │ │ │ │ ├── E=256,N=1344,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=256,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=256,N=2688,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=256,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=256,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=256,N=672,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=256,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ │ ├── E=32,N=1856,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=32,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=32,N=928,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=32,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=40,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H200.json
│ │ │ │ │ │ ├── E=512,N=1344,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_H200.json
│ │ │ │ │ │ ├── E=512,N=2688,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=512,N=336,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=336,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=512,N=672,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=64,N=1856,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=64,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=64,N=2688,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=64,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=64,N=464,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=64,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=64,N=928,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=64,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ └── E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ ├── fused_marlin_moe.py
│ │ │ │ │ ├── fused_moe.py
│ │ │ │ │ ├── fused_moe_triton_config.py
│ │ │ │ │ ├── fused_moe_triton_kernels.py
│ │ │ │ │ ├── layer.py
│ │ │ │ │ ├── moe_align_block_size.py
│ │ │ │ │ └── triton_kernels_moe.py
│ │ │ │ ├── kt_ep_wrapper.py
│ │ │ │ ├── moe_runner/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── deep_gemm.py
│ │ │ │ │ ├── flashinfer_trtllm.py
│ │ │ │ │ ├── marlin.py
│ │ │ │ │ ├── runner.py
│ │ │ │ │ ├── triton.py
│ │ │ │ │ └── triton_kernels.py
│ │ │ │ ├── rocm_moe_utils.py
│ │ │ │ ├── routed_experts_capturer.py
│ │ │ │ ├── router.py
│ │ │ │ ├── token_dispatcher/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── deepep.py
│ │ │ │ │ ├── flashinfer.py
│ │ │ │ │ ├── flashinfer_utils.py
│ │ │ │ │ ├── fuseep.py
│ │ │ │ │ ├── mooncake.py
│ │ │ │ │ ├── moriep.py
│ │ │ │ │ ├── nixl.py
│ │ │ │ │ └── standard.py
│ │ │ │ ├── topk.py
│ │ │ │ └── utils.py
│ │ │ ├── multimodal.py
│ │ │ ├── n_gram_embedding.py
│ │ │ ├── parameter.py
│ │ │ ├── pooler.py
│ │ │ ├── quantization/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── auto_round.py
│ │ │ │ ├── awq.py
│ │ │ │ ├── awq_triton.py
│ │ │ │ ├── base_config.py
│ │ │ │ ├── base_scheme.py
│ │ │ │ ├── bitsandbytes.py
│ │ │ │ ├── blockwise_int8.py
│ │ │ │ ├── compressed_tensors/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── compressed_tensors.py
│ │ │ │ │ ├── schemes/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── compressed_tensors_scheme.py
│ │ │ │ │ │ ├── compressed_tensors_w4a4_mxint4_moe.py
│ │ │ │ │ │ ├── compressed_tensors_w4a4_nvfp4.py
│ │ │ │ │ │ ├── compressed_tensors_w4a4_nvfp4_moe.py
│ │ │ │ │ │ ├── compressed_tensors_w4a8_int8_moe.py
│ │ │ │ │ │ ├── compressed_tensors_w8a16_fp8.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_fp8.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_fp8_moe.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_int8.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_int8_moe.py
│ │ │ │ │ │ ├── compressed_tensors_wNa16.py
│ │ │ │ │ │ └── compressed_tensors_wNa16_moe.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── configs/
│ │ │ │ │ ├── N=1280,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=4096,device_name=NVIDIA_L40,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=5120,K=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=5120,K=2048,device_name=NVIDIA_L40,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=5120,K=3200,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=6400,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ └── README.md
│ │ │ │ ├── fp4_utils.py
│ │ │ │ ├── fp8.py
│ │ │ │ ├── fp8_kernel.py
│ │ │ │ ├── fp8_utils.py
│ │ │ │ ├── fpgemm_fp8.py
│ │ │ │ ├── gguf.py
│ │ │ │ ├── gptq.py
│ │ │ │ ├── int8_kernel.py
│ │ │ │ ├── int8_utils.py
│ │ │ │ ├── kv_cache.py
│ │ │ │ ├── kvfp4_tensor.py
│ │ │ │ ├── marlin_utils.py
│ │ │ │ ├── marlin_utils_fp8.py
│ │ │ │ ├── modelopt_quant.py
│ │ │ │ ├── modelslim/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── modelslim.py
│ │ │ │ │ └── schemes/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── modelslim_scheme.py
│ │ │ │ │ ├── modelslim_w4a4_int4.py
│ │ │ │ │ ├── modelslim_w4a4_int4_moe.py
│ │ │ │ │ ├── modelslim_w4a8_int8_moe.py
│ │ │ │ │ ├── modelslim_w8a8_int8.py
│ │ │ │ │ └── modelslim_w8a8_int8_moe.py
│ │ │ │ ├── moe_wna16.py
│ │ │ │ ├── mxfp4.py
│ │ │ │ ├── mxfp4_tensor.py
│ │ │ │ ├── petit.py
│ │ │ │ ├── petit_utils.py
│ │ │ │ ├── qoq.py
│ │ │ │ ├── quark/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── quark.py
│ │ │ │ │ ├── schemes/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── quark_scheme.py
│ │ │ │ │ │ ├── quark_w4a4_mxfp4.py
│ │ │ │ │ │ ├── quark_w4a4_mxfp4_moe.py
│ │ │ │ │ │ ├── quark_w8a8_fp8.py
│ │ │ │ │ │ └── quark_w8a8_fp8_moe.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── quark_int4fp8_moe.py
│ │ │ │ ├── rocm_mxfp4_utils.py
│ │ │ │ ├── unquant.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── w4afp8.py
│ │ │ │ ├── w8a8_fp8.py
│ │ │ │ └── w8a8_int8.py
│ │ │ ├── radix_attention.py
│ │ │ ├── radix_linear_attention.py
│ │ │ ├── rocm_linear_utils.py
│ │ │ ├── rotary_embedding/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── factory.py
│ │ │ │ ├── mrope.py
│ │ │ │ ├── mrope_rope_index.py
│ │ │ │ ├── rope_variant.py
│ │ │ │ ├── triton_kernels.py
│ │ │ │ ├── utils.py
│ │ │ │ └── yarn.py
│ │ │ ├── sampler.py
│ │ │ ├── sparse_pooler.py
│ │ │ ├── torchao_utils.py
│ │ │ ├── utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── common.py
│ │ │ │ ├── hash.py
│ │ │ │ ├── logprob.py
│ │ │ │ └── multi_platform.py
│ │ │ └── vocab_parallel_embedding.py
│ │ ├── lora/
│ │ │ ├── backend/
│ │ │ │ ├── ascend_backend.py
│ │ │ │ ├── base_backend.py
│ │ │ │ ├── chunked_backend.py
│ │ │ │ ├── lmhead_mixing.py
│ │ │ │ ├── lora_registry.py
│ │ │ │ ├── torch_backend.py
│ │ │ │ └── triton_backend.py
│ │ │ ├── eviction_policy.py
│ │ │ ├── layers.py
│ │ │ ├── lora.py
│ │ │ ├── lora_config.py
│ │ │ ├── lora_manager.py
│ │ │ ├── lora_overlap_loader.py
│ │ │ ├── lora_registry.py
│ │ │ ├── mem_pool.py
│ │ │ ├── torch_ops/
│ │ │ │ ├── __init__.py
│ │ │ │ └── lora_ops.py
│ │ │ ├── triton_ops/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── chunked_embedding_lora_a.py
│ │ │ │ ├── chunked_sgmv_expand.py
│ │ │ │ ├── chunked_sgmv_shrink.py
│ │ │ │ ├── embedding_lora_a.py
│ │ │ │ ├── fused_moe_lora_kernel.py
│ │ │ │ ├── gate_up_lora_b.py
│ │ │ │ ├── qkv_lora_b.py
│ │ │ │ ├── sgemm_lora_a.py
│ │ │ │ └── sgemm_lora_b.py
│ │ │ └── utils.py
│ │ ├── managers/
│ │ │ ├── async_dynamic_batch_tokenizer.py
│ │ │ ├── async_mm_data_processor.py
│ │ │ ├── cache_controller.py
│ │ │ ├── configure_logging.py
│ │ │ ├── data_parallel_controller.py
│ │ │ ├── detokenizer_manager.py
│ │ │ ├── disagg_service.py
│ │ │ ├── io_struct.py
│ │ │ ├── mm_utils.py
│ │ │ ├── multi_tokenizer_mixin.py
│ │ │ ├── multimodal_processor.py
│ │ │ ├── overlap_utils.py
│ │ │ ├── prefill_delayer.py
│ │ │ ├── schedule_batch.py
│ │ │ ├── schedule_policy.py
│ │ │ ├── scheduler.py
│ │ │ ├── scheduler_dp_attn_mixin.py
│ │ │ ├── scheduler_input_blocker.py
│ │ │ ├── scheduler_output_processor_mixin.py
│ │ │ ├── scheduler_pp_mixin.py
│ │ │ ├── scheduler_profiler_mixin.py
│ │ │ ├── scheduler_recv_skipper.py
│ │ │ ├── scheduler_runtime_checker_mixin.py
│ │ │ ├── scheduler_update_weights_mixin.py
│ │ │ ├── session_controller.py
│ │ │ ├── template_manager.py
│ │ │ ├── tokenizer_communicator_mixin.py
│ │ │ ├── tokenizer_manager.py
│ │ │ ├── tokenizer_manager_multiitem_mixin.py
│ │ │ ├── tp_worker.py
│ │ │ └── utils.py
│ │ ├── mem_cache/
│ │ │ ├── allocator.py
│ │ │ ├── base_prefix_cache.py
│ │ │ ├── cache_init_params.py
│ │ │ ├── chunk_cache.py
│ │ │ ├── common.py
│ │ │ ├── cpp_radix_tree/
│ │ │ │ ├── common.h
│ │ │ │ ├── radix_tree.py
│ │ │ │ ├── tree_v2.cpp
│ │ │ │ ├── tree_v2.h
│ │ │ │ ├── tree_v2_binding.cpp
│ │ │ │ ├── tree_v2_debug.cpp
│ │ │ │ ├── tree_v2_impl.h
│ │ │ │ └── tree_v2_node.h
│ │ │ ├── evict_policy.py
│ │ │ ├── flush_cache.py
│ │ │ ├── hi_mamba_radix_cache.py
│ │ │ ├── hicache_storage.py
│ │ │ ├── hiradix_cache.py
│ │ │ ├── mamba_radix_cache.py
│ │ │ ├── memory_pool.py
│ │ │ ├── memory_pool_host.py
│ │ │ ├── multimodal_cache.py
│ │ │ ├── radix_cache.py
│ │ │ ├── radix_cache_cpp.py
│ │ │ ├── session_aware_cache.py
│ │ │ ├── sparsity/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── algorithms/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base_algorithm.py
│ │ │ │ │ ├── deepseek_nsa.py
│ │ │ │ │ └── quest_algorithm.py
│ │ │ │ ├── backend/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── backend_adaptor.py
│ │ │ │ ├── core/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── sparse_coordinator.py
│ │ │ │ └── factory.py
│ │ │ ├── storage/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── aibrix_kvcache/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── aibrix_kvcache_storage.py
│ │ │ │ │ └── unit_test.py
│ │ │ │ ├── backend_factory.py
│ │ │ │ ├── eic/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── eic_storage.py
│ │ │ │ │ └── test_unit.py
│ │ │ │ ├── hf3fs/
│ │ │ │ │ ├── docs/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── deploy_sglang_3fs_multinode.md
│ │ │ │ │ │ └── setup_usrbio_client.md
│ │ │ │ │ ├── hf3fs_client.py
│ │ │ │ │ ├── hf3fs_usrbio_client.py
│ │ │ │ │ ├── hf3fs_utils.cpp
│ │ │ │ │ ├── mini_3fs_metadata_server.py
│ │ │ │ │ ├── storage_hf3fs.py
│ │ │ │ │ └── test_hf3fs_utils.py
│ │ │ │ ├── lmcache/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── example_config.yaml
│ │ │ │ │ ├── lmc_radix_cache.py
│ │ │ │ │ └── unit_test.py
│ │ │ │ ├── mooncake_store/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── embedding_cache_controller.py
│ │ │ │ │ ├── mooncake_embedding_store.py
│ │ │ │ │ ├── mooncake_store.py
│ │ │ │ │ └── test_mooncake_store.py
│ │ │ │ └── nixl/
│ │ │ │ ├── README.md
│ │ │ │ ├── hicache_nixl.py
│ │ │ │ ├── nixl.config.toml.sample
│ │ │ │ ├── nixl_utils.py
│ │ │ │ └── test_hicache_nixl_storage.py
│ │ │ ├── swa_memory_pool.py
│ │ │ ├── swa_radix_cache.py
│ │ │ └── utils.py
│ │ ├── model_executor/
│ │ │ ├── cpu_graph_runner.py
│ │ │ ├── cuda_graph_runner.py
│ │ │ ├── forward_batch_deepseek_mha_mixin.py
│ │ │ ├── forward_batch_info.py
│ │ │ ├── hook_manager.py
│ │ │ ├── input_buffers.py
│ │ │ ├── mindspore_runner.py
│ │ │ ├── model_runner.py
│ │ │ ├── model_runner_kv_cache_mixin.py
│ │ │ └── piecewise_cuda_graph_runner.py
│ │ ├── model_loader/
│ │ │ ├── __init__.py
│ │ │ ├── ci_weight_validation.py
│ │ │ ├── loader.py
│ │ │ ├── remote_instance_weight_loader_utils.py
│ │ │ ├── utils.py
│ │ │ └── weight_utils.py
│ │ ├── models/
│ │ │ ├── afmoe.py
│ │ │ ├── apertus.py
│ │ │ ├── arcee.py
│ │ │ ├── baichuan.py
│ │ │ ├── bailing_moe.py
│ │ │ ├── bailing_moe_linear.py
│ │ │ ├── bailing_moe_nextn.py
│ │ │ ├── bert.py
│ │ │ ├── chatglm.py
│ │ │ ├── clip.py
│ │ │ ├── commandr.py
│ │ │ ├── dbrx.py
│ │ │ ├── deepseek.py
│ │ │ ├── deepseek_common/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention_backend_handler.py
│ │ │ │ ├── attention_forward_methods/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── forward_methods.py
│ │ │ │ │ ├── forward_mha.py
│ │ │ │ │ ├── forward_mla.py
│ │ │ │ │ ├── forward_mla_fused_rope_cpu.py
│ │ │ │ │ └── forward_mla_fused_rope_rocm.py
│ │ │ │ ├── deepseek_weight_loader.py
│ │ │ │ └── utils.py
│ │ │ ├── deepseek_janus_pro.py
│ │ │ ├── deepseek_nextn.py
│ │ │ ├── deepseek_ocr.py
│ │ │ ├── deepseek_v2.py
│ │ │ ├── deepseek_vl2.py
│ │ │ ├── dots_ocr.py
│ │ │ ├── dots_vlm.py
│ │ │ ├── dots_vlm_vit.py
│ │ │ ├── ernie4.py
│ │ │ ├── ernie45_moe_vl.py
│ │ │ ├── ernie45_vl.py
│ │ │ ├── ernie4_eagle.py
│ │ │ ├── exaone.py
│ │ │ ├── exaone4.py
│ │ │ ├── exaone_moe.py
│ │ │ ├── exaone_moe_mtp.py
│ │ │ ├── falcon_h1.py
│ │ │ ├── gemma.py
│ │ │ ├── gemma2.py
│ │ │ ├── gemma2_reward.py
│ │ │ ├── gemma3_causal.py
│ │ │ ├── gemma3_mm.py
│ │ │ ├── gemma3n_audio.py
│ │ │ ├── gemma3n_causal.py
│ │ │ ├── gemma3n_mm.py
│ │ │ ├── glm4.py
│ │ │ ├── glm4_moe.py
│ │ │ ├── glm4_moe_lite.py
│ │ │ ├── glm4_moe_nextn.py
│ │ │ ├── glm4v.py
│ │ │ ├── glm4v_moe.py
│ │ │ ├── glm_ocr.py
│ │ │ ├── glm_ocr_nextn.py
│ │ │ ├── glmasr.py
│ │ │ ├── gpt2.py
│ │ │ ├── gpt_bigcode.py
│ │ │ ├── gpt_j.py
│ │ │ ├── gpt_oss.py
│ │ │ ├── granite.py
│ │ │ ├── granitemoe.py
│ │ │ ├── granitemoehybrid.py
│ │ │ ├── grok.py
│ │ │ ├── hunyuan.py
│ │ │ ├── idefics2.py
│ │ │ ├── internlm2.py
│ │ │ ├── internlm2_reward.py
│ │ │ ├── interns1.py
│ │ │ ├── interns1pro.py
│ │ │ ├── internvl.py
│ │ │ ├── iquest_loopcoder.py
│ │ │ ├── jet_nemotron.py
│ │ │ ├── jet_vlm.py
│ │ │ ├── kimi_k25.py
│ │ │ ├── kimi_linear.py
│ │ │ ├── kimi_vl.py
│ │ │ ├── kimi_vl_moonvit.py
│ │ │ ├── lfm2.py
│ │ │ ├── lfm2_moe.py
│ │ │ ├── lightonocr.py
│ │ │ ├── llada2.py
│ │ │ ├── llama.py
│ │ │ ├── llama4.py
│ │ │ ├── llama_classification.py
│ │ │ ├── llama_eagle.py
│ │ │ ├── llama_eagle3.py
│ │ │ ├── llama_embedding.py
│ │ │ ├── llama_reward.py
│ │ │ ├── llava.py
│ │ │ ├── llavavid.py
│ │ │ ├── longcat_flash.py
│ │ │ ├── longcat_flash_nextn.py
│ │ │ ├── midashenglm.py
│ │ │ ├── mimo.py
│ │ │ ├── mimo_mtp.py
│ │ │ ├── mimo_v2_flash.py
│ │ │ ├── mimo_v2_flash_nextn.py
│ │ │ ├── mindspore.py
│ │ │ ├── minicpm.py
│ │ │ ├── minicpm3.py
│ │ │ ├── minicpmo.py
│ │ │ ├── minicpmv.py
│ │ │ ├── minimax_m2.py
│ │ │ ├── ministral3.py
│ │ │ ├── mistral.py
│ │ │ ├── mistral_large_3.py
│ │ │ ├── mistral_large_3_eagle.py
│ │ │ ├── mixtral.py
│ │ │ ├── mixtral_quant.py
│ │ │ ├── mllama.py
│ │ │ ├── mllama4.py
│ │ │ ├── nano_nemotron_vl.py
│ │ │ ├── nemotron_h.py
│ │ │ ├── nemotron_h_mtp.py
│ │ │ ├── nemotron_nas.py
│ │ │ ├── nvila.py
│ │ │ ├── nvila_lite.py
│ │ │ ├── olmo.py
│ │ │ ├── olmo2.py
│ │ │ ├── olmoe.py
│ │ │ ├── opt.py
│ │ │ ├── orion.py
│ │ │ ├── paddleocr_vl.py
│ │ │ ├── persimmon.py
│ │ │ ├── phi.py
│ │ │ ├── phi3_small.py
│ │ │ ├── phi4mm.py
│ │ │ ├── phi4mm_audio.py
│ │ │ ├── phi4mm_utils.py
│ │ │ ├── phimoe.py
│ │ │ ├── pixtral.py
│ │ │ ├── points_v15_chat.py
│ │ │ ├── qwen.py
│ │ │ ├── qwen2.py
│ │ │ ├── qwen2_5_vl.py
│ │ │ ├── qwen2_audio.py
│ │ │ ├── qwen2_classification.py
│ │ │ ├── qwen2_eagle.py
│ │ │ ├── qwen2_moe.py
│ │ │ ├── qwen2_rm.py
│ │ │ ├── qwen2_vl.py
│ │ │ ├── qwen3.py
│ │ │ ├── qwen3_5.py
│ │ │ ├── qwen3_5_mtp.py
│ │ │ ├── qwen3_classification.py
│ │ │ ├── qwen3_moe.py
│ │ │ ├── qwen3_next.py
│ │ │ ├── qwen3_next_mtp.py
│ │ │ ├── qwen3_omni_moe.py
│ │ │ ├── qwen3_rm.py
│ │ │ ├── qwen3_vl.py
│ │ │ ├── qwen3_vl_moe.py
│ │ │ ├── radio.py
│ │ │ ├── registry.py
│ │ │ ├── roberta.py
│ │ │ ├── sarashina2_vision.py
│ │ │ ├── sarvam_moe.py
│ │ │ ├── sdar.py
│ │ │ ├── sdar_moe.py
│ │ │ ├── siglip.py
│ │ │ ├── solar.py
│ │ │ ├── stablelm.py
│ │ │ ├── starcoder2.py
│ │ │ ├── step3_vl.py
│ │ │ ├── step3_vl_10b.py
│ │ │ ├── step3p5.py
│ │ │ ├── step3p5_mtp.py
│ │ │ ├── teleflm.py
│ │ │ ├── torch_native_llama.py
│ │ │ ├── transformers.py
│ │ │ ├── utils.py
│ │ │ ├── whisper.py
│ │ │ ├── xverse.py
│ │ │ ├── xverse_moe.py
│ │ │ └── yivl.py
│ │ ├── multimodal/
│ │ │ ├── customized_mm_processor_utils.py
│ │ │ ├── evs/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── evs_core.py
│ │ │ │ ├── evs_module.py
│ │ │ │ └── evs_processor.py
│ │ │ ├── internvl_utils.py
│ │ │ ├── internvl_vit_cuda_graph_runner.py
│ │ │ ├── mm_utils.py
│ │ │ ├── processors/
│ │ │ │ ├── base_processor.py
│ │ │ │ ├── clip.py
│ │ │ │ ├── deepseek_ocr.py
│ │ │ │ ├── deepseek_vl_v2.py
│ │ │ │ ├── dots_vlm.py
│ │ │ │ ├── ernie45_vl.py
│ │ │ │ ├── gemma3.py
│ │ │ │ ├── gemma3n.py
│ │ │ │ ├── glm4v.py
│ │ │ │ ├── glmasr.py
│ │ │ │ ├── interns1pro.py
│ │ │ │ ├── internvl.py
│ │ │ │ ├── janus_pro.py
│ │ │ │ ├── kimi_k25.py
│ │ │ │ ├── kimi_vl.py
│ │ │ │ ├── lightonocr.py
│ │ │ │ ├── llava.py
│ │ │ │ ├── midashenglm.py
│ │ │ │ ├── minicpm.py
│ │ │ │ ├── mlama.py
│ │ │ │ ├── mllama4.py
│ │ │ │ ├── nano_nemotron_vl.py
│ │ │ │ ├── nvila.py
│ │ │ │ ├── paddleocr_vlm.py
│ │ │ │ ├── phi4mm.py
│ │ │ │ ├── pixtral.py
│ │ │ │ ├── points_v15_chat.py
│ │ │ │ ├── qwen_audio.py
│ │ │ │ ├── qwen_vl.py
│ │ │ │ ├── sarashina2_vision.py
│ │ │ │ ├── step3_vl.py
│ │ │ │ └── whisper.py
│ │ │ └── vit_cuda_graph_runner.py
│ │ ├── multiplex/
│ │ │ ├── multiplexing_mixin.py
│ │ │ └── pdmux_context.py
│ │ ├── observability/
│ │ │ ├── cpu_monitor.py
│ │ │ ├── func_timer.py
│ │ │ ├── label_transform.py
│ │ │ ├── metrics_collector.py
│ │ │ ├── req_time_stats.py
│ │ │ ├── request_metrics_exporter.py
│ │ │ ├── scheduler_metrics_mixin.py
│ │ │ ├── startup_func_log_and_timer.py
│ │ │ ├── trace.py
│ │ │ └── utils.py
│ │ ├── parser/
│ │ │ ├── code_completion_parser.py
│ │ │ ├── conversation.py
│ │ │ ├── harmony_parser.py
│ │ │ ├── jinja_template_utils.py
│ │ │ └── reasoning_parser.py
│ │ ├── ray/
│ │ │ ├── __init__.py
│ │ │ ├── engine.py
│ │ │ ├── http_server.py
│ │ │ └── scheduler_actor.py
│ │ ├── sampling/
│ │ │ ├── custom_logit_processor.py
│ │ │ ├── penaltylib/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── frequency_penalty.py
│ │ │ │ ├── min_new_tokens.py
│ │ │ │ ├── orchestrator.py
│ │ │ │ └── presence_penalty.py
│ │ │ ├── sampling_batch_info.py
│ │ │ └── sampling_params.py
│ │ ├── server_args.py
│ │ ├── server_args_config_parser.py
│ │ ├── speculative/
│ │ │ ├── base_spec_worker.py
│ │ │ ├── cpp_ngram/
│ │ │ │ ├── .clang-format
│ │ │ │ ├── ngram.cpp
│ │ │ │ ├── ngram.h
│ │ │ │ ├── ngram_cache.py
│ │ │ │ ├── ngram_cache_binding.cpp
│ │ │ │ ├── param.h
│ │ │ │ └── queue.h
│ │ │ ├── draft_utils.py
│ │ │ ├── eagle_draft_cuda_graph_runner.py
│ │ │ ├── eagle_draft_extend_cuda_graph_runner.py
│ │ │ ├── eagle_info.py
│ │ │ ├── eagle_info_v2.py
│ │ │ ├── eagle_utils.py
│ │ │ ├── eagle_worker.py
│ │ │ ├── eagle_worker_v2.py
│ │ │ ├── multi_layer_eagle_draft_extend_cuda_graph_runner.py
│ │ │ ├── multi_layer_eagle_utils.py
│ │ │ ├── multi_layer_eagle_worker.py
│ │ │ ├── multi_layer_eagle_worker_v2.py
│ │ │ ├── ngram_info.py
│ │ │ ├── ngram_worker.py
│ │ │ ├── spec_info.py
│ │ │ ├── spec_utils.py
│ │ │ ├── standalone_worker.py
│ │ │ └── standalone_worker_v2.py
│ │ ├── tokenizer/
│ │ │ └── tiktoken_tokenizer.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ ├── aio_rwlock.py
│ │ │ ├── auth.py
│ │ │ ├── bench_utils.py
│ │ │ ├── common.py
│ │ │ ├── cuda_ipc_transport_utils.py
│ │ │ ├── custom_op.py
│ │ │ ├── device_timer.py
│ │ │ ├── gauge_histogram.py
│ │ │ ├── hf_transformers_utils.py
│ │ │ ├── host_shared_memory.py
│ │ │ ├── json_response.py
│ │ │ ├── log_utils.py
│ │ │ ├── mistral_utils.py
│ │ │ ├── model_file_verifier.py
│ │ │ ├── multi_stream_utils.py
│ │ │ ├── network.py
│ │ │ ├── numa_utils.py
│ │ │ ├── nvtx_pytorch_hooks.py
│ │ │ ├── offloader.py
│ │ │ ├── patch_tokenizer.py
│ │ │ ├── patch_torch.py
│ │ │ ├── poll_based_barrier.py
│ │ │ ├── profile_merger.py
│ │ │ ├── profile_utils.py
│ │ │ ├── request_logger.py
│ │ │ ├── rpd_utils.py
│ │ │ ├── scheduler_status_logger.py
│ │ │ ├── slow_rank_detector.py
│ │ │ ├── torch_memory_saver_adapter.py
│ │ │ ├── video_decoder.py
│ │ │ ├── watchdog.py
│ │ │ └── weight_checker.py
│ │ └── weight_sync/
│ │ ├── tensor_bucket.py
│ │ └── utils.py
│ ├── test/
│ │ ├── __init__.py
│ │ ├── accuracy_test_runner.py
│ │ ├── ascend/
│ │ │ ├── __init__.py
│ │ │ ├── disaggregation_utils.py
│ │ │ ├── gsm8k_ascend_mixin.py
│ │ │ ├── test_ascend_utils.py
│ │ │ └── vlm_utils.py
│ │ ├── attention/
│ │ │ ├── __init__.py
│ │ │ ├── test_flashattn_backend.py
│ │ │ ├── test_flashattn_mla_backend.py
│ │ │ ├── test_prefix_chunk_info.py
│ │ │ └── test_trtllm_mla_backend.py
│ │ ├── bench_one_batch_server_internal.py
│ │ ├── ci/
│ │ │ ├── __init__.py
│ │ │ ├── ci_register.py
│ │ │ ├── ci_stress_utils.py
│ │ │ ├── ci_utils.py
│ │ │ └── run_with_retry.py
│ │ ├── doc_patch.py
│ │ ├── external_models/
│ │ │ └── custom_qwen2_vl.py
│ │ ├── few_shot_gsm8k.py
│ │ ├── few_shot_gsm8k_engine.py
│ │ ├── get_logits_ut.py
│ │ ├── gpt_oss_common.py
│ │ ├── kits/
│ │ │ ├── abort_timeout_kit.py
│ │ │ ├── cache_hit_kit.py
│ │ │ ├── ebnf_constrained_kit.py
│ │ │ ├── gsm8k_accuracy_kit.py
│ │ │ ├── json_constrained_kit.py
│ │ │ ├── kl_divergence_kit.py
│ │ │ ├── lm_eval_kit.py
│ │ │ ├── matched_stop_kit.py
│ │ │ ├── mmmu_vlm_kit.py
│ │ │ ├── prefix_cache_branching_kit.py
│ │ │ ├── radix_cache_server_kit.py
│ │ │ ├── regex_constrained_kit.py
│ │ │ └── spec_decoding_kit.py
│ │ ├── kl_test_utils.py
│ │ ├── long_prompt.txt
│ │ ├── longbench_v2/
│ │ │ ├── __init__.py
│ │ │ ├── longbench_v2_evaluation.md
│ │ │ ├── test_longbench_v2_eval.py
│ │ │ ├── validate_longbench_v2.py
│ │ │ └── validate_longbench_v2_standalone.py
│ │ ├── lora_utils.py
│ │ ├── nightly_bench_utils.py
│ │ ├── nightly_utils.py
│ │ ├── performance_test_runner.py
│ │ ├── run_combined_tests.py
│ │ ├── run_eval.py
│ │ ├── runners.py
│ │ ├── send_one.py
│ │ ├── server_fixtures/
│ │ │ ├── default_fixture.py
│ │ │ ├── disaggregation_fixture.py
│ │ │ ├── eagle_fixture.py
│ │ │ └── mmmu_fixture.py
│ │ ├── simple_eval_aime25.py
│ │ ├── simple_eval_common.py
│ │ ├── simple_eval_gpqa.py
│ │ ├── simple_eval_gsm8k.py
│ │ ├── simple_eval_humaneval.py
│ │ ├── simple_eval_longbench_v2.py
│ │ ├── simple_eval_math.py
│ │ ├── simple_eval_mgsm.py
│ │ ├── simple_eval_mmlu.py
│ │ ├── simple_eval_mmmu_vlm.py
│ │ ├── speculative/
│ │ │ └── test_spec_utils.py
│ │ ├── test_activation.py
│ │ ├── test_block_fp8.py
│ │ ├── test_block_fp8_deep_gemm_blackwell.py
│ │ ├── test_custom_ops.py
│ │ ├── test_cutlass_moe.py
│ │ ├── test_cutlass_w16a16_moe.py
│ │ ├── test_cutlass_w4a8_moe.py
│ │ ├── test_deepep_utils.py
│ │ ├── test_deterministic.py
│ │ ├── test_deterministic_utils.py
│ │ ├── test_dump_metric.py
│ │ ├── test_dynamic_grad_mode.py
│ │ ├── test_flashinfer_dispatcher.py
│ │ ├── test_http_server_auth.py
│ │ ├── test_kvfp4_quant_dequant.py
│ │ ├── test_layernorm.py
│ │ ├── test_marlin_utils.py
│ │ ├── test_programs.py
│ │ ├── test_utils.py
│ │ ├── tool_call_test_runner.py
│ │ └── vlm_utils.py
│ ├── utils.py
│ └── version.py
├── scripts/
│ ├── check_vram_clear.sh
│ ├── ci/
│ │ ├── amd/
│ │ │ ├── amd_ci_exec.sh
│ │ │ ├── amd_ci_install_dependency.sh
│ │ │ ├── amd_ci_start_container.sh
│ │ │ ├── amd_ci_start_container_disagg.sh
│ │ │ ├── amd_ci_warmup_aiter.py
│ │ │ └── test_rccl_multi_gpu.py
│ │ ├── cuda/
│ │ │ ├── ci_download_flashinfer_cubin.sh
│ │ │ ├── ci_install_deepep.sh
│ │ │ ├── ci_install_dependency.sh
│ │ │ ├── ci_install_gateway_dependencies.sh
│ │ │ ├── ci_start_disaggregation_servers.sh
│ │ │ ├── prepare_runner.sh
│ │ │ ├── warmup_deep_gemm.py
│ │ │ └── warmup_server.py
│ │ ├── musa/
│ │ │ ├── musa_install_dependency.sh
│ │ │ └── rename_wheels_musa.sh
│ │ ├── npu/
│ │ │ ├── npu_ci_install_dependency.sh
│ │ │ └── npu_log_print.sh
│ │ └── utils/
│ │ ├── ci_coverage_report.py
│ │ ├── cleanup_hf_cache.py
│ │ ├── merge_metrics.py
│ │ ├── prevalidate_cached_models.py
│ │ ├── publish_diffusion_gt.py
│ │ ├── publish_traces.py
│ │ ├── query_job_status.py
│ │ ├── runner_utilization_report.py
│ │ ├── save_diffusion_metrics.py
│ │ ├── save_metrics.py
│ │ └── slash_command_handler.py
│ ├── ci_monitor/
│ │ ├── README.md
│ │ ├── ci_failures_analysis.py
│ │ └── post_ci_failures_to_slack.py
│ ├── code_sync/
│ │ ├── check_commits.py
│ │ ├── copy_from_oss.py
│ │ ├── copy_to_oss.py
│ │ ├── guideline.md
│ │ ├── install_github_cli.sh
│ │ └── utils.py
│ ├── convert_otel_2_perfetto.py
│ ├── ensure_vram_clear.sh
│ ├── export_deepseek_nextn.py
│ ├── killall_sglang.sh
│ ├── playground/
│ │ ├── bench_speculative.py
│ │ ├── disaggregation/
│ │ │ ├── cli-logprob.py
│ │ │ ├── cli-so.py
│ │ │ └── cli.py
│ │ ├── frontend_reasoning.ipynb
│ │ ├── load_tokenizer.py
│ │ ├── long_context_example.py
│ │ ├── lora/
│ │ │ ├── analyzer.py
│ │ │ ├── lora_hf_play.py
│ │ │ └── lora_vllm_play.py
│ │ ├── reference_hf.py
│ │ ├── replay_request_dump.py
│ │ └── router/
│ │ ├── test_tree.py
│ │ └── tree.py
│ ├── release/
│ │ ├── README.md
│ │ ├── bump_flashinfer_version.py
│ │ ├── bump_kernel_version.py
│ │ ├── bump_kernel_version_to_sglang.py
│ │ ├── bump_sglang_version.py
│ │ ├── check_kernel_version_to_sglang.py
│ │ ├── commit_and_pr.sh
│ │ ├── commit_and_pr_kernel_to_sglang.sh
│ │ ├── test_utils.py
│ │ └── utils.py
│ ├── sort_testcases_alphabetically.py
│ ├── update_kernel_whl_index.py
│ ├── update_nightly_whl_index.py
│ ├── update_pr_whl_index.py
│ └── version_branch_to_tag.sh
├── sgl-kernel/
│ ├── .clang-format
│ ├── CMakeLists.txt
│ ├── Dockerfile
│ ├── LICENSE
│ ├── Makefile
│ ├── README.md
│ ├── THIRDPARTYNOTICES.txt
│ ├── analyze_whl_kernel_sizes.py
│ ├── benchmark/
│ │ ├── bench_activation.py
│ │ ├── bench_amd_deterministic_allreduce.py
│ │ ├── bench_awq_dequant.py
│ │ ├── bench_cutlass_mla.py
│ │ ├── bench_dsv3_fused_a_gemm.py
│ │ ├── bench_dsv3_router_gemm.py
│ │ ├── bench_es_fp8_blockwise_grouped_gemm.py
│ │ ├── bench_fp4_gemm.py
│ │ ├── bench_fp8_blockwise_gemm.py
│ │ ├── bench_fp8_blockwise_group_gemm.py
│ │ ├── bench_fp8_gemm.py
│ │ ├── bench_int8_gemm.py
│ │ ├── bench_kimi_k2_moe_fused_gate.py
│ │ ├── bench_moe_align_block_size.py
│ │ ├── bench_moe_ep_post_reorder.py
│ │ ├── bench_moe_fused_gate.py
│ │ ├── bench_moe_topk_sigmoid.py
│ │ ├── bench_moe_topk_softmax.py
│ │ ├── bench_mrope.py
│ │ ├── bench_per_tensor_quant_fp8.py
│ │ ├── bench_per_token_group_quant_8bit.py
│ │ ├── bench_per_token_quant_fp8.py
│ │ ├── bench_qserve_w4a8_gemm.py
│ │ ├── bench_rmsnorm.py
│ │ ├── bench_rotary_embedding.py
│ │ ├── bench_sum_scale.py
│ │ └── bench_top_k_top_p_sampling.py
│ ├── build.sh
│ ├── cmake/
│ │ ├── flashmla.cmake
│ │ └── utils.cmake
│ ├── csrc/
│ │ ├── allreduce/
│ │ │ ├── custom_all_reduce.cu
│ │ │ ├── custom_all_reduce.cuh
│ │ │ ├── custom_all_reduce.hip
│ │ │ ├── custom_all_reduce_hip.cuh
│ │ │ ├── deterministic_all_reduce.hip
│ │ │ ├── mscclpp_allreduce.cu
│ │ │ ├── mscclpp_allreduce.cuh
│ │ │ ├── quick_all_reduce.cu
│ │ │ ├── quick_all_reduce.cuh
│ │ │ ├── quick_all_reduce.h
│ │ │ ├── quick_all_reduce_base.h
│ │ │ └── test_mscclpp_allreduce.cu
│ │ ├── attention/
│ │ │ ├── cascade.cu
│ │ │ ├── cutlass_mla_kernel.cu
│ │ │ ├── cutlass_sm100_mla/
│ │ │ │ ├── device/
│ │ │ │ │ └── sm100_mla.hpp
│ │ │ │ └── kernel/
│ │ │ │ ├── sm100_fmha_mla_reduction.hpp
│ │ │ │ ├── sm100_fmha_mla_tma_warpspecialized.hpp
│ │ │ │ └── sm100_mla_tile_scheduler.hpp
│ │ │ ├── merge_attn_states.cu
│ │ │ └── vertical_slash_index.cu
│ │ ├── common_extension.cc
│ │ ├── common_extension_musa.cc
│ │ ├── common_extension_rocm.cc
│ │ ├── cpu/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── aarch64/
│ │ │ │ └── shm.h
│ │ │ ├── activation.cpp
│ │ │ ├── bmm.cpp
│ │ │ ├── common.h
│ │ │ ├── conv3d.cpp
│ │ │ ├── decode.cpp
│ │ │ ├── extend.cpp
│ │ │ ├── flash_attn.cpp
│ │ │ ├── flash_attn.h
│ │ │ ├── gemm.cpp
│ │ │ ├── gemm.h
│ │ │ ├── gemm_fp8.cpp
│ │ │ ├── gemm_int4.cpp
│ │ │ ├── gemm_int8.cpp
│ │ │ ├── interface.cpp
│ │ │ ├── mamba/
│ │ │ │ ├── conv.cpp
│ │ │ │ └── fla.cpp
│ │ │ ├── model/
│ │ │ │ └── qwen3.cpp
│ │ │ ├── moe.cpp
│ │ │ ├── moe_fp8.cpp
│ │ │ ├── moe_int4.cpp
│ │ │ ├── moe_int8.cpp
│ │ │ ├── norm.cpp
│ │ │ ├── numa_utils.cpp
│ │ │ ├── preprocessor.cpp
│ │ │ ├── qkv_proj.cpp
│ │ │ ├── rope.cpp
│ │ │ ├── shm.cpp
│ │ │ ├── shm.h
│ │ │ ├── topk.cpp
│ │ │ ├── torch_extension_cpu.cpp
│ │ │ ├── vec.h
│ │ │ ├── vec_pack.h
│ │ │ └── x86_64/
│ │ │ └── shm.h
│ │ ├── cutlass_extensions/
│ │ │ ├── common.hpp
│ │ │ ├── detail/
│ │ │ │ └── collective/
│ │ │ │ └── mixed_input_utils.hpp
│ │ │ ├── epilogue/
│ │ │ │ └── epilogue_per_row_per_col_scale.h
│ │ │ └── gemm/
│ │ │ ├── collective/
│ │ │ │ ├── builders/
│ │ │ │ │ └── sm90_gmma_builder_mixed_input.inl
│ │ │ │ ├── collective_builder_mixed_input.hpp
│ │ │ │ ├── collective_mma_array_mixed_input.hpp
│ │ │ │ └── sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
│ │ │ ├── cutlass_gemm_caller.cuh
│ │ │ ├── dispatch_policy.hpp
│ │ │ ├── fp8_blockwise_gemm_sm90_dispatch.cuh
│ │ │ ├── gemm_universal_base_compat.h
│ │ │ └── gemm_with_epilogue_visitor.h
│ │ ├── elementwise/
│ │ │ ├── activation.cu
│ │ │ ├── cast.cu
│ │ │ ├── concat_mla.cu
│ │ │ ├── copy.cu
│ │ │ ├── fused_add_rms_norm_kernel.cu
│ │ │ ├── pos_enc.cu
│ │ │ ├── pos_enc.cuh
│ │ │ ├── topk.cu
│ │ │ └── utils.cuh
│ │ ├── expert_specialization/
│ │ │ ├── es_fp8_blockwise.cu
│ │ │ ├── es_fp8_blockwise_functor.cuh
│ │ │ ├── es_fp8_blockwise_launcher.cuh
│ │ │ ├── es_fp8_blockwise_traits.cuh
│ │ │ ├── es_sm100_mxfp8_blockscaled.cu
│ │ │ ├── es_sm100_mxfp8_blockscaled_functor.cuh
│ │ │ ├── es_sm100_mxfp8_blockscaled_group_quant.cu
│ │ │ ├── es_sm100_mxfp8_blockscaled_group_quant.cuh
│ │ │ ├── es_sm100_mxfp8_blockscaled_launcher.cuh
│ │ │ └── es_sm100_mxfp8_blockscaled_traits.cuh
│ │ ├── flash_extension.cc
│ │ ├── flashmla_extension.cc
│ │ ├── gemm/
│ │ │ ├── awq_kernel.cu
│ │ │ ├── bmm_fp8.cu
│ │ │ ├── dsv3_fused_a_gemm.cu
│ │ │ ├── dsv3_router_gemm_bf16_out.cu
│ │ │ ├── dsv3_router_gemm_entry.cu
│ │ │ ├── dsv3_router_gemm_float_out.cu
│ │ │ ├── fp8_blockwise_gemm_kernel.cu
│ │ │ ├── fp8_gemm_kernel.cu
│ │ │ ├── gptq/
│ │ │ │ ├── compat.cuh
│ │ │ │ ├── gptq_kernel.cu
│ │ │ │ ├── matrix_view.cuh
│ │ │ │ ├── qdq_2.cuh
│ │ │ │ ├── qdq_3.cuh
│ │ │ │ ├── qdq_4.cuh
│ │ │ │ ├── qdq_8.cuh
│ │ │ │ └── qdq_util.cuh
│ │ │ ├── int8_gemm_kernel.cu
│ │ │ ├── marlin/
│ │ │ │ ├── dequant.h
│ │ │ │ ├── kernel.h
│ │ │ │ ├── marlin.cuh
│ │ │ │ ├── marlin_dtypes.cuh
│ │ │ │ └── marlin_template.h
│ │ │ ├── math.hpp
│ │ │ ├── per_token_group_quant_8bit.cu
│ │ │ ├── per_token_group_quant_8bit_v2.cu
│ │ │ ├── per_token_quant_fp8.cu
│ │ │ ├── qserve_w4a8_per_chn_gemm.cu
│ │ │ └── qserve_w4a8_per_group_gemm.cu
│ │ ├── grammar/
│ │ │ └── apply_token_bitmask_inplace_cuda.cu
│ │ ├── kvcacheio/
│ │ │ └── transfer.cu
│ │ ├── mamba/
│ │ │ ├── causal_conv1d.cu
│ │ │ └── causal_conv1d.h
│ │ ├── memory/
│ │ │ └── weak_ref_tensor.cpp
│ │ ├── moe/
│ │ │ ├── cutlass_moe/
│ │ │ │ └── w4a8/
│ │ │ │ ├── scaled_mm_entry.cu
│ │ │ │ ├── w4a8_get_group_starts.cuh
│ │ │ │ ├── w4a8_grouped_mm_c3x.cu
│ │ │ │ ├── w4a8_grouped_mm_c3x.cuh
│ │ │ │ └── w4a8_moe_data.cu
│ │ │ ├── cutlass_moe_helper.cu
│ │ │ ├── fp8_blockwise_moe_kernel.cu
│ │ │ ├── fused_qknorm_rope_kernel.cu
│ │ │ ├── kimi_k2_moe_fused_gate.cu
│ │ │ ├── moe_align_kernel.cu
│ │ │ ├── moe_fused_gate.cu
│ │ │ ├── moe_sum.cu
│ │ │ ├── moe_sum_reduce.cu
│ │ │ ├── moe_topk_sigmoid_kernels.cu
│ │ │ ├── moe_topk_softmax_kernels.cu
│ │ │ └── prepare_moe_input.cu
│ │ ├── quantization/
│ │ │ └── gguf/
│ │ │ ├── dequantize.cuh
│ │ │ ├── ggml-common.h
│ │ │ ├── gguf_kernel.cu
│ │ │ ├── mmq.cuh
│ │ │ ├── mmvq.cuh
│ │ │ ├── moe.cuh
│ │ │ ├── moe_vec.cuh
│ │ │ └── vecdotq.cuh
│ │ ├── spatial/
│ │ │ ├── cuda_utils.h
│ │ │ ├── greenctx_stream.cu
│ │ │ └── greenctx_stream.h
│ │ ├── spatial_extension.cc
│ │ └── speculative/
│ │ ├── eagle_utils.cu
│ │ ├── ngram_utils.cu
│ │ ├── packbit.cu
│ │ ├── speculative_sampling.cu
│ │ └── speculative_sampling.cuh
│ ├── include/
│ │ ├── hip/
│ │ │ ├── hip_act_and_mul.cuh
│ │ │ ├── hip_math_def.h
│ │ │ ├── hip_vec_dtypes.h
│ │ │ └── impl/
│ │ │ ├── hip_vec_bf16_impl.h
│ │ │ ├── hip_vec_fp32_impl.h
│ │ │ └── hip_vec_half_impl.h
│ │ ├── pytorch_extension_utils_rocm.h
│ │ ├── scalar_type.hpp
│ │ ├── sgl_flash_kernel_ops.h
│ │ ├── sgl_kernel_ops.h
│ │ ├── sgl_kernel_torch_shim.h
│ │ └── utils.h
│ ├── kernel-runner-setup.sh
│ ├── pyproject.toml
│ ├── pyproject_cpu.toml
│ ├── pyproject_musa.toml
│ ├── pyproject_rocm.toml
│ ├── python/
│ │ └── sgl_kernel/
│ │ ├── __init__.py
│ │ ├── _fa4_interface.py
│ │ ├── allreduce.py
│ │ ├── attention.py
│ │ ├── cutlass_moe.py
│ │ ├── elementwise.py
│ │ ├── expert_specialization.py
│ │ ├── flash_attn.py
│ │ ├── flash_mla.py
│ │ ├── gemm.py
│ │ ├── grammar.py
│ │ ├── kvcacheio.py
│ │ ├── load_utils.py
│ │ ├── mamba.py
│ │ ├── memory.py
│ │ ├── moe.py
│ │ ├── quantization/
│ │ │ ├── __init__.py
│ │ │ └── gguf.py
│ │ ├── sampling.py
│ │ ├── scalar_type.py
│ │ ├── sparse_flash_attn.py
│ │ ├── spatial.py
│ │ ├── speculative.py
│ │ ├── test_utils.py
│ │ ├── testing/
│ │ │ ├── __init__.py
│ │ │ └── rotary_embedding.py
│ │ ├── top_k.py
│ │ ├── utils.py
│ │ └── version.py
│ ├── rename_wheels.sh
│ ├── setup_musa.py
│ ├── setup_rocm.py
│ └── tests/
│ ├── conftest.py
│ ├── spatial/
│ │ └── test_greenctx_stream.py
│ ├── speculative/
│ │ ├── test_eagle_utils.py
│ │ ├── test_ngram_utils.py
│ │ └── test_speculative_sampling.py
│ ├── test_activation.py
│ ├── test_amd_deterministic_custom_allreduce.py
│ ├── test_amd_nccl_allreduce_determinism.py
│ ├── test_apply_token_bitmask_inplace.py
│ ├── test_awq_dequant.py
│ ├── test_bmm_fp8.py
│ ├── test_causal_conv1d.py
│ ├── test_copy.py
│ ├── test_custom_allreduce.py
│ ├── test_cutlass_mla.py
│ ├── test_cutlass_w4a8_moe_mm.py
│ ├── test_dsv3_fused_a_gemm.py
│ ├── test_dsv3_router_gemm.py
│ ├── test_es_fp8_blockwise_moe.py
│ ├── test_es_mxfp8_blockscaled_moe.py
│ ├── test_flash_attention.py
│ ├── test_flash_attn_sparse.py
│ ├── test_flashmla.py
│ ├── test_fp8_blockwise_gemm.py
│ ├── test_fp8_blockwise_moe.py
│ ├── test_fp8_gemm.py
│ ├── test_fused_qk_norm_rope.py
│ ├── test_gguf.py
│ ├── test_gptq_kernel.py
│ ├── test_hadamard.py
│ ├── test_int8_gemm.py
│ ├── test_kimi_k2_moe_fused_gate.py
│ ├── test_kvcacheio.py
│ ├── test_merge_state.py
│ ├── test_merge_state_v2.py
│ ├── test_moe_align.py
│ ├── test_moe_fused_gate.py
│ ├── test_moe_topk_sigmoid.py
│ ├── test_moe_topk_softmax.py
│ ├── test_mscclpp.py
│ ├── test_norm.py
│ ├── test_per_token_group_quant_8bit.py
│ ├── test_per_token_quant_fp8.py
│ ├── test_qserve_w4a8_per_chn_gemm.py
│ ├── test_qserve_w4a8_per_group_gemm.py
│ ├── test_sampling.py
│ ├── test_topk.py
│ ├── test_torch_defaults_reset.py
│ └── utils.py
├── sgl-model-gateway/
│ ├── .cargo/
│ │ └── config.toml
│ ├── Cargo.toml
│ ├── Makefile
│ ├── README.md
│ ├── benches/
│ │ ├── consistent_hash_bench.rs
│ │ ├── manual_policy_benchmark.rs
│ │ ├── request_processing.rs
│ │ ├── router_registry_bench.rs
│ │ ├── tree_benchmark.rs
│ │ └── wasm_middleware_latency.rs
│ ├── bindings/
│ │ ├── golang/
│ │ │ ├── .gitignore
│ │ │ ├── Cargo.toml
│ │ │ ├── Makefile
│ │ │ ├── README.md
│ │ │ ├── client.go
│ │ │ ├── client_test.go
│ │ │ ├── examples/
│ │ │ │ ├── oai_server/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── config/
│ │ │ │ │ │ └── config.go
│ │ │ │ │ ├── docs/
│ │ │ │ │ │ └── benchmark_result.md
│ │ │ │ │ ├── go.sum
│ │ │ │ │ ├── handlers/
│ │ │ │ │ │ ├── chat.go
│ │ │ │ │ │ ├── health.go
│ │ │ │ │ │ └── models.go
│ │ │ │ │ ├── logger/
│ │ │ │ │ │ └── logger.go
│ │ │ │ │ ├── main.go
│ │ │ │ │ ├── models/
│ │ │ │ │ │ └── chat.go
│ │ │ │ │ ├── run.sh
│ │ │ │ │ ├── scripts/
│ │ │ │ │ │ ├── analyze_tpot.sh
│ │ │ │ │ │ ├── pprof_analysis.sh
│ │ │ │ │ │ ├── pprof_quick.sh
│ │ │ │ │ │ ├── pprof_test.sh
│ │ │ │ │ │ └── profile_tpot.sh
│ │ │ │ │ ├── service/
│ │ │ │ │ │ └── sglang.go
│ │ │ │ │ └── utils/
│ │ │ │ │ └── utils.go
│ │ │ │ ├── simple/
│ │ │ │ │ ├── main.go
│ │ │ │ │ └── run.sh
│ │ │ │ └── streaming/
│ │ │ │ ├── main.go
│ │ │ │ └── run.sh
│ │ │ ├── go.sum
│ │ │ ├── integration_test.go
│ │ │ ├── internal/
│ │ │ │ ├── ffi/
│ │ │ │ │ ├── batch_postprocessor.go
│ │ │ │ │ ├── client.go
│ │ │ │ │ ├── grpc_converter.go
│ │ │ │ │ ├── postprocessor.go
│ │ │ │ │ └── preprocessor.go
│ │ │ │ ├── grpc/
│ │ │ │ │ └── client_grpc.go
│ │ │ │ └── proto/
│ │ │ │ ├── sglang_scheduler.pb.go
│ │ │ │ └── sglang_scheduler_grpc.pb.go
│ │ │ └── src/
│ │ │ ├── client.rs
│ │ │ ├── error.rs
│ │ │ ├── grpc_converter.rs
│ │ │ ├── lib.rs
│ │ │ ├── memory.rs
│ │ │ ├── postprocessor.rs
│ │ │ ├── preprocessor.rs
│ │ │ ├── stream.rs
│ │ │ ├── tokenizer.rs
│ │ │ ├── tool_parser.rs
│ │ │ └── utils.rs
│ │ └── python/
│ │ ├── .coveragerc
│ │ ├── Cargo.toml
│ │ ├── MANIFEST.in
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ ├── setup.py
│ │ ├── src/
│ │ │ ├── lib.rs
│ │ │ └── sglang_router/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── cli.py
│ │ │ ├── launch_router.py
│ │ │ ├── launch_server.py
│ │ │ ├── mini_lb.py
│ │ │ ├── router.py
│ │ │ ├── router_args.py
│ │ │ └── version.py
│ │ └── tests/
│ │ ├── conftest.py
│ │ ├── test_arg_parser.py
│ │ ├── test_router_config.py
│ │ ├── test_startup_sequence.py
│ │ └── test_validation.py
│ ├── build.rs
│ ├── e2e_test/
│ │ ├── __init__.py
│ │ ├── benchmarks/
│ │ │ ├── __init__.py
│ │ │ ├── conftest.py
│ │ │ ├── results.py
│ │ │ ├── summarize.py
│ │ │ ├── test_pd_perf.py
│ │ │ └── test_regular_perf.py
│ │ ├── chat_completions/
│ │ │ ├── __init__.py
│ │ │ ├── test_enable_thinking.py
│ │ │ ├── test_function_calling.py
│ │ │ ├── test_openai_server.py
│ │ │ ├── test_reasoning_content.py
│ │ │ └── test_validation.py
│ │ ├── conftest.py
│ │ ├── embeddings/
│ │ │ ├── __init__.py
│ │ │ ├── test_basic.py
│ │ │ └── test_correctness.py
│ │ ├── fixtures/
│ │ │ ├── __init__.py
│ │ │ ├── hooks.py
│ │ │ ├── markers.py
│ │ │ ├── pool.py
│ │ │ ├── ports.py
│ │ │ └── setup_backend.py
│ │ ├── infra/
│ │ │ ├── __init__.py
│ │ │ ├── constants.py
│ │ │ ├── gateway.py
│ │ │ ├── gpu_allocator.py
│ │ │ ├── gpu_monitor.py
│ │ │ ├── model_pool.py
│ │ │ ├── model_specs.py
│ │ │ ├── process_utils.py
│ │ │ ├── run_eval.py
│ │ │ ├── simple_eval_common.py
│ │ │ └── simple_eval_mmlu.py
│ │ ├── pyproject.toml
│ │ ├── responses/
│ │ │ ├── __init__.py
│ │ │ ├── test_basic_crud.py
│ │ │ ├── test_state_management.py
│ │ │ ├── test_streaming_events.py
│ │ │ ├── test_structured_output.py
│ │ │ └── test_tools_call.py
│ │ └── router/
│ │ ├── __init__.py
│ │ ├── test_mmlu.py
│ │ ├── test_pd_mmlu.py
│ │ └── test_worker_api.py
│ ├── examples/
│ │ └── wasm/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── wasm-guest-auth/
│ │ │ ├── Cargo.toml
│ │ │ ├── README.md
│ │ │ ├── build.sh
│ │ │ └── src/
│ │ │ └── lib.rs
│ │ ├── wasm-guest-logging/
│ │ │ ├── Cargo.toml
│ │ │ ├── README.md
│ │ │ ├── build.sh
│ │ │ └── src/
│ │ │ └── lib.rs
│ │ └── wasm-guest-ratelimit/
│ │ ├── Cargo.toml
│ │ ├── README.md
│ │ ├── build.sh
│ │ └── src/
│ │ └── lib.rs
│ ├── pytest.ini
│ ├── rustfmt.toml
│ ├── scripts/
│ │ ├── generate_gateway_release_notes.sh
│ │ ├── generate_vision_golden.py
│ │ ├── run_benchmarks.py
│ │ └── setup-sccache.sh
│ ├── src/
│ │ ├── app_context.rs
│ │ ├── config/
│ │ │ ├── builder.rs
│ │ │ ├── mod.rs
│ │ │ ├── types.rs
│ │ │ └── validation.rs
│ │ ├── core/
│ │ │ ├── circuit_breaker.rs
│ │ │ ├── error.rs
│ │ │ ├── job_queue.rs
│ │ │ ├── metrics_aggregator.rs
│ │ │ ├── mod.rs
│ │ │ ├── model_card.rs
│ │ │ ├── model_type.rs
│ │ │ ├── retry.rs
│ │ │ ├── steps/
│ │ │ │ ├── mcp_registration.rs
│ │ │ │ ├── mod.rs
│ │ │ │ ├── tokenizer_registration.rs
│ │ │ │ ├── wasm_module_registration.rs
│ │ │ │ ├── wasm_module_removal.rs
│ │ │ │ ├── worker/
│ │ │ │ │ ├── external/
│ │ │ │ │ │ ├── create_workers.rs
│ │ │ │ │ │ ├── discover_models.rs
│ │ │ │ │ │ └── mod.rs
│ │ │ │ │ ├── local/
│ │ │ │ │ │ ├── create_worker.rs
│ │ │ │ │ │ ├── detect_connection.rs
│ │ │ │ │ │ ├── discover_dp.rs
│ │ │ │ │ │ ├── discover_metadata.rs
│ │ │ │ │ │ ├── find_worker_to_update.rs
│ │ │ │ │ │ ├── find_workers_to_remove.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── remove_from_policy_registry.rs
│ │ │ │ │ │ ├── remove_from_worker_registry.rs
│ │ │ │ │ │ ├── submit_tokenizer_job.rs
│ │ │ │ │ │ ├── update_policies_for_worker.rs
│ │ │ │ │ │ ├── update_remaining_policies.rs
│ │ │ │ │ │ └── update_worker_properties.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ └── shared/
│ │ │ │ │ ├── activate.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── register.rs
│ │ │ │ │ └── update_policies.rs
│ │ │ │ ├── workflow_data.rs
│ │ │ │ └── workflow_engines.rs
│ │ │ ├── token_bucket.rs
│ │ │ ├── worker.rs
│ │ │ ├── worker_builder.rs
│ │ │ ├── worker_manager.rs
│ │ │ ├── worker_registry.rs
│ │ │ └── worker_service.rs
│ │ ├── lib.rs
│ │ ├── main.rs
│ │ ├── middleware.rs
│ │ ├── observability/
│ │ │ ├── events.rs
│ │ │ ├── gauge_histogram.rs
│ │ │ ├── inflight_tracker.rs
│ │ │ ├── logging.rs
│ │ │ ├── metrics.rs
│ │ │ ├── mod.rs
│ │ │ └── otel_trace.rs
│ │ ├── policies/
│ │ │ ├── bucket.rs
│ │ │ ├── cache_aware.rs
│ │ │ ├── consistent_hashing.rs
│ │ │ ├── factory.rs
│ │ │ ├── manual.rs
│ │ │ ├── mod.rs
│ │ │ ├── power_of_two.rs
│ │ │ ├── prefix_hash.rs
│ │ │ ├── random.rs
│ │ │ ├── registry.rs
│ │ │ ├── round_robin.rs
│ │ │ ├── tree.rs
│ │ │ └── utils.rs
│ │ ├── routers/
│ │ │ ├── conversations/
│ │ │ │ ├── handlers.rs
│ │ │ │ └── mod.rs
│ │ │ ├── error.rs
│ │ │ ├── factory.rs
│ │ │ ├── grpc/
│ │ │ │ ├── client.rs
│ │ │ │ ├── common/
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── response_collection.rs
│ │ │ │ │ ├── response_formatting.rs
│ │ │ │ │ ├── responses/
│ │ │ │ │ │ ├── context.rs
│ │ │ │ │ │ ├── handlers.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── streaming.rs
│ │ │ │ │ │ └── utils.rs
│ │ │ │ │ └── stages/
│ │ │ │ │ ├── client_acquisition.rs
│ │ │ │ │ ├── dispatch_metadata.rs
│ │ │ │ │ ├── helpers.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── request_execution.rs
│ │ │ │ │ └── worker_selection.rs
│ │ │ │ ├── context.rs
│ │ │ │ ├── harmony/
│ │ │ │ │ ├── builder.rs
│ │ │ │ │ ├── detector.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── parser.rs
│ │ │ │ │ ├── processor.rs
│ │ │ │ │ ├── responses/
│ │ │ │ │ │ ├── common.rs
│ │ │ │ │ │ ├── execution.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── non_streaming.rs
│ │ │ │ │ │ └── streaming.rs
│ │ │ │ │ ├── stages/
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ ├── streaming.rs
│ │ │ │ │ └── types.rs
│ │ │ │ ├── mod.rs
│ │ │ │ ├── pd_router.rs
│ │ │ │ ├── pipeline.rs
│ │ │ │ ├── proto_wrapper.rs
│ │ │ │ ├── regular/
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── processor.rs
│ │ │ │ │ ├── responses/
│ │ │ │ │ │ ├── common.rs
│ │ │ │ │ │ ├── conversions.rs
│ │ │ │ │ │ ├── handlers.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── non_streaming.rs
│ │ │ │ │ │ └── streaming.rs
│ │ │ │ │ ├── stages/
│ │ │ │ │ │ ├── chat/
│ │ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ │ ├── classify/
│ │ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ │ ├── embedding/
│ │ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ │ ├── generate/
│ │ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ └── streaming.rs
│ │ │ │ ├── router.rs
│ │ │ │ └── utils.rs
│ │ │ ├── header_utils.rs
│ │ │ ├── http/
│ │ │ │ ├── mod.rs
│ │ │ │ ├── pd_router.rs
│ │ │ │ ├── pd_types.rs
│ │ │ │ └── router.rs
│ │ │ ├── mcp_utils.rs
│ │ │ ├── mesh/
│ │ │ │ ├── handlers.rs
│ │ │ │ └── mod.rs
│ │ │ ├── mod.rs
│ │ │ ├── openai/
│ │ │ │ ├── context.rs
│ │ │ │ ├── mod.rs
│ │ │ │ ├── provider.rs
│ │ │ │ ├── responses/
│ │ │ │ │ ├── accumulator.rs
│ │ │ │ │ ├── common.rs
│ │ │ │ │ ├── mcp.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── non_streaming.rs
│ │ │ │ │ ├── streaming.rs
│ │ │ │ │ ├── tool_handler.rs
│ │ │ │ │ └── utils.rs
│ │ │ │ └── router.rs
│ │ │ ├── parse/
│ │ │ │ ├── handlers.rs
│ │ │ │ └── mod.rs
│ │ │ ├── persistence_utils.rs
│ │ │ ├── router_manager.rs
│ │ │ └── tokenize/
│ │ │ ├── handlers.rs
│ │ │ └── mod.rs
│ │ ├── server.rs
│ │ ├── service_discovery.rs
│ │ ├── version.rs
│ │ └── wasm/
│ │ ├── mod.rs
│ │ └── route.rs
│ └── tests/
│ ├── api/
│ │ ├── api_endpoints_test.rs
│ │ ├── mod.rs
│ │ ├── parser_endpoints_test.rs
│ │ ├── request_formats_test.rs
│ │ ├── responses_api_test.rs
│ │ └── streaming_tests.rs
│ ├── api_tests.rs
│ ├── common/
│ │ ├── mock_mcp_server.rs
│ │ ├── mock_openai_server.rs
│ │ ├── mock_worker.rs
│ │ ├── mod.rs
│ │ ├── redis_test_server.rs
│ │ ├── streaming_helpers.rs
│ │ ├── test_app.rs
│ │ ├── test_certs.rs
│ │ ├── test_config.rs
│ │ └── tls_mock_worker.rs
│ ├── inflight_tracker_test.rs
│ ├── load_guard_raii_test.rs
│ ├── mcp_test.rs
│ ├── metrics_aggregator_test.rs
│ ├── otel_tracing_test.rs
│ ├── reliability/
│ │ ├── circuit_breaker_test.rs
│ │ ├── fault_tolerance_test.rs
│ │ ├── mod.rs
│ │ ├── rate_limiting_test.rs
│ │ └── retries_test.rs
│ ├── reliability_tests.rs
│ ├── routing/
│ │ ├── cache_aware_backward_compat_test.rs
│ │ ├── header_forwarding_test.rs
│ │ ├── load_balancing_test.rs
│ │ ├── manual_routing_test.rs
│ │ ├── mod.rs
│ │ ├── payload_size_test.rs
│ │ ├── pd_routing_test.rs
│ │ ├── policy_registry_integration.rs
│ │ ├── power_of_two_test.rs
│ │ ├── service_discovery_test.rs
│ │ ├── test_openai_routing.rs
│ │ ├── test_pd_routing.rs
│ │ └── worker_management_test.rs
│ ├── routing_tests.rs
│ ├── security/
│ │ ├── auth_integration_test.rs
│ │ ├── auth_test.rs
│ │ ├── mod.rs
│ │ └── mtls_test.rs
│ ├── security_tests.rs
│ ├── spec/
│ │ ├── chat_completion.rs
│ │ ├── chat_message.rs
│ │ ├── embedding.rs
│ │ ├── mod.rs
│ │ ├── rerank.rs
│ │ └── responses.rs
│ ├── spec_test.rs
│ └── wasm_test.rs
└── test/
├── README.md
├── lm_eval_configs/
│ ├── NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
│ ├── NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
│ └── Qwen3.5-397B-A17B.yaml
├── manual/
│ ├── ascend/
│ │ ├── test_ascend_deepseek_mtp.py
│ │ ├── test_ascend_w8a8_quantization.py
│ │ └── test_mindspore_models.py
│ ├── cpu/
│ │ └── test_comm.py
│ ├── debug_utils/
│ │ └── test_log_parser.py
│ ├── entrypoints/
│ │ └── http_server/
│ │ └── test_abort_request.py
│ ├── ep/
│ │ ├── test_deepep_internode.py
│ │ ├── test_deepep_intranode.py
│ │ ├── test_deepep_low_latency.py
│ │ ├── test_eplb.py
│ │ ├── test_moe_deepep.py
│ │ ├── test_moe_deepep_eval_accuracy_large.py
│ │ ├── test_mooncake_expert_backup.py
│ │ └── test_nixl_ep.py
│ ├── hicache/
│ │ ├── test_disaggregation_hicache.py
│ │ └── test_pp_with_hicache.py
│ ├── kv_transfer/
│ │ └── test_mooncake_transfer_engine.py
│ ├── lang_frontend/
│ │ ├── test_bind_cache.py
│ │ ├── test_choices.py
│ │ ├── test_jump_forward.py
│ │ ├── test_openai_backend.py
│ │ ├── test_separate_reasoning.py
│ │ └── test_separate_reasoning_execution.py
│ ├── layers/
│ │ ├── attention/
│ │ │ └── nsa/
│ │ │ ├── test_act_quant_triton.py
│ │ │ ├── test_get_k_scale_triton_kernel.py
│ │ │ └── test_index_buf_accessor.py
│ │ └── moe/
│ │ ├── test_moe_runners_1gpu.py
│ │ └── test_moe_runners_4gpu.py
│ ├── lora/
│ │ ├── test_lora_cuda_graph.py
│ │ ├── test_lora_llama4.py
│ │ ├── test_lora_ops.py
│ │ ├── test_lora_qwen3_vl.py
│ │ ├── test_lora_spec_decoding.py
│ │ └── test_torch_backend.py
│ ├── models/
│ │ ├── test_clip_models.py
│ │ ├── test_falcon_h1_models.py
│ │ ├── test_gme_qwen_models.py
│ │ ├── test_grok_models.py
│ │ ├── test_kimi_k2_models.py
│ │ ├── test_llama4_models.py
│ │ ├── test_mistral_large3_basic.py
│ │ ├── test_mtp_models.py
│ │ └── test_unsloth_models.py
│ ├── nightly/
│ │ ├── test_deepseek_v31_perf.py
│ │ ├── test_deepseek_v32_perf.py
│ │ ├── test_text_models_gsm8k_eval.py
│ │ ├── test_text_models_perf.py
│ │ ├── test_vlms_mmmu_eval.py
│ │ ├── test_vlms_perf.py
│ │ ├── test_vlms_piecewise_cuda_graph.py
│ │ ├── test_vlms_vit_cuda_graph.py
│ │ └── test_vlms_vit_flashinfer_cudnn.py
│ ├── openai_server/
│ │ └── features/
│ │ ├── test_cache_report.py
│ │ ├── test_continuous_usage_stats.py
│ │ └── test_structural_tag.py
│ ├── piecewise_cudagraph/
│ │ └── test_disaggregation_piecewise_cuda_graph.py
│ ├── quant/
│ │ └── test_fp8_kvcache.py
│ ├── test_async_dynamic_batch_tokenizer.py
│ ├── test_async_mm_data_processor.py
│ ├── test_config_integration.py
│ ├── test_custom_allreduce.py
│ ├── test_deepseek_chat_templates.py
│ ├── test_double_sparsity.py
│ ├── test_expert_distribution.py
│ ├── test_expert_location_updater.py
│ ├── test_fim_completion.py
│ ├── test_forward_split_prefill.py
│ ├── test_get_weights_by_name.py
│ ├── test_health_check.py
│ ├── test_kv_events.py
│ ├── test_logprobs.py
│ ├── test_mla_tp.py
│ ├── test_modelopt.py
│ ├── test_modelopt_fp8kvcache.py
│ ├── test_models_from_modelscope.py
│ ├── test_mori_transfer_engine_e2e.py
│ ├── test_mscclpp.py
│ ├── test_quick_allreduce.py
│ ├── test_ray_engine.py
│ ├── test_sagemaker_server.py
│ ├── test_schedule_policy.py
│ ├── test_srt_engine_with_quant_args.py
│ ├── test_tokenizer_batch_encode.py
│ ├── test_tokenizer_manager.py
│ ├── test_torch_flex_attention_backend.py
│ ├── test_torch_tp.py
│ ├── test_tracing.py
│ ├── test_triton_attention_rocm_mla.py
│ ├── test_triton_moe_wna16.py
│ ├── test_trtllm_fp8_kv_kernel.py
│ ├── test_two_batch_overlap.py
│ ├── test_vertex_endpoint.py
│ ├── test_vlm_accuracy.py
│ ├── test_wave_attention_backend.py
│ ├── test_weight_validation.py
│ ├── test_weight_version.py
│ └── vlm/
│ └── test_anthropic_vision.py
├── pytest.ini
├── registered/
│ ├── 4-gpu-models/
│ │ ├── test_deepseek_v3_cutedsl_4gpu.py
│ │ ├── test_gpt_oss_4gpu.py
│ │ ├── test_nvidia_nemotron_3_super_nvfp4.py
│ │ ├── test_qwen35_models.py
│ │ ├── test_qwen3_next_models.py
│ │ └── test_qwen3_next_models_mtp.py
│ ├── 8-gpu-models/
│ │ ├── test_deepseek_v31.py
│ │ ├── test_deepseek_v32.py
│ │ ├── test_deepseek_v32_basic.py
│ │ ├── test_deepseek_v32_cp_single_node.py
│ │ ├── test_deepseek_v32_mtp.py
│ │ ├── test_deepseek_v3_basic.py
│ │ ├── test_deepseek_v3_mtp.py
│ │ ├── test_glm_46.py
│ │ ├── test_glm_46_fp8.py
│ │ ├── test_gpt_oss_120b.py
│ │ ├── test_kimi_k25.py
│ │ ├── test_llama4.py
│ │ ├── test_mimo_models.py
│ │ ├── test_minimax_m25.py
│ │ ├── test_mistral_large3.py
│ │ ├── test_nvidia_nemotron_3_super_bf16.py
│ │ ├── test_nvidia_nemotron_3_super_nightly.py
│ │ ├── test_qwen35.py
│ │ ├── test_qwen3_235b.py
│ │ └── test_ring_2_5_1t.py
│ ├── README.md
│ ├── amd/
│ │ ├── accuracy/
│ │ │ ├── mi30x/
│ │ │ │ ├── test_deepseek_r1_eval_amd.py
│ │ │ │ ├── test_deepseek_v31_eval_amd.py
│ │ │ │ ├── test_deepseek_v32_dp_eval_amd.py
│ │ │ │ ├── test_deepseek_v32_eval_amd.py
│ │ │ │ ├── test_deepseek_v32_mtp_eval_amd.py
│ │ │ │ ├── test_deepseek_v32_tc_eval_amd.py
│ │ │ │ ├── test_glm5_eval_amd.py
│ │ │ │ ├── test_gpt_oss_eval_amd.py
│ │ │ │ ├── test_grok1_fp8_eval_amd.py
│ │ │ │ ├── test_grok1_int4_eval_amd.py
│ │ │ │ ├── test_grok2_eval_amd.py
│ │ │ │ ├── test_grok_eval_amd.py
│ │ │ │ ├── test_gsm8k_eval_amd.py
│ │ │ │ ├── test_kimi_k25_eval_amd.py
│ │ │ │ ├── test_kimi_k2_eval_amd.py
│ │ │ │ ├── test_minimax_m25_eval_amd.py
│ │ │ │ ├── test_qwen35_eval_amd.py
│ │ │ │ └── test_vlms_mmmu_eval_amd.py
│ │ │ └── mi35x/
│ │ │ ├── test_deepseek_r1_eval_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_eval_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py
│ │ │ ├── test_deepseek_v32_dp_eval_mi35x.py
│ │ │ ├── test_deepseek_v32_eval_mi35x.py
│ │ │ ├── test_deepseek_v32_mtp_eval_mi35x.py
│ │ │ ├── test_glm5_eval_mi35x.py
│ │ │ ├── test_gpt_oss_eval_mi35x.py
│ │ │ ├── test_grok1_int4_eval_mi35x.py
│ │ │ ├── test_grok2_eval_mi35x.py
│ │ │ ├── test_kimi_k25_aiter_mla_eval_mi35x.py
│ │ │ ├── test_kimi_k25_eval_mi35x.py
│ │ │ ├── test_kimi_k25_mxfp4_eval_mi35x.py
│ │ │ ├── test_kimi_k2_eval_mi35x.py
│ │ │ ├── test_minimax_m25_eval_mi35x.py
│ │ │ ├── test_qwen35_eval_mi35x.py
│ │ │ └── test_qwen3_coder_next_eval_mi35x.py
│ │ ├── disaggregation/
│ │ │ ├── test_disaggregation_basic.py
│ │ │ └── test_disaggregation_pp.py
│ │ ├── perf/
│ │ │ ├── mi30x/
│ │ │ │ ├── test_deepseek_v31_perf.py
│ │ │ │ ├── test_deepseek_v32_basic_perf_amd.py
│ │ │ │ ├── test_deepseek_v32_mtp_perf_amd.py
│ │ │ │ ├── test_deepseek_v3_perf.py
│ │ │ │ ├── test_grok1_fp8_perf.py
│ │ │ │ ├── test_grok1_int4_perf.py
│ │ │ │ ├── test_grok2_perf.py
│ │ │ │ ├── test_text_models_perf_amd.py
│ │ │ │ └── test_vlms_perf_amd.py
│ │ │ └── mi35x/
│ │ │ ├── test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_perf_mi35x.py
│ │ │ ├── test_deepseek_v32_basic_perf_mi35x.py
│ │ │ ├── test_deepseek_v32_mtp_perf_mi35x.py
│ │ │ ├── test_grok1_int4_perf_mi35x.py
│ │ │ └── test_grok2_perf_mi35x.py
│ │ ├── test_deepseek_r1_mxfp4_8gpu.py
│ │ ├── test_deepseek_v32_basic.py
│ │ ├── test_deepseek_v32_mtp.py
│ │ ├── test_deepseek_v3_basic.py
│ │ ├── test_deepseek_v3_basic_kv_fp8.py
│ │ ├── test_deepseek_v3_mtp.py
│ │ ├── test_deepseek_v3_mtp_kv_fp8.py
│ │ ├── test_kimi_k25_mxfp4.py
│ │ ├── test_kimi_k2_instruct.py
│ │ ├── test_moriep_small.py
│ │ ├── test_qwen3_coder_next_8gpu.py
│ │ ├── test_qwen3_instruct.py
│ │ ├── test_qwen3_instruct_fp8.py
│ │ ├── test_qwen3_instruct_mxfp4.py
│ │ └── test_zimage_turbo.py
│ ├── ascend/
│ │ ├── basic_function/
│ │ │ ├── HiCache/
│ │ │ │ ├── test_npu_hierarchical_cache.py
│ │ │ │ ├── test_npu_hierarchical_cache_mla.py
│ │ │ │ ├── test_npu_hierarchical_cache_mutually_exclusive.py
│ │ │ │ ├── test_npu_hierarchical_cache_ttft_mha.py
│ │ │ │ └── test_npu_radix_cache.py
│ │ │ ├── parallel_strategy/
│ │ │ │ └── expert_parallelism/
│ │ │ │ ├── test_npu_deepep_auto_deepseek_v3_2_w8a8.py
│ │ │ │ ├── test_npu_deepep_auto_qwen3_480b.py
│ │ │ │ ├── test_npu_deepep_auto_qwen3_next.py
│ │ │ │ ├── test_npu_deepep_low_latency_deepseek_v3_2_w8a8.py
│ │ │ │ ├── test_npu_deepep_low_latency_qwen3_480b.py
│ │ │ │ └── test_npu_deepep_low_latency_qwen3_next.py
│ │ │ ├── parameter/
│ │ │ │ ├── deepseek_coder.json
│ │ │ │ ├── test_npu_fim_completion.py
│ │ │ │ ├── test_npu_log_level.py
│ │ │ │ ├── test_npu_no_chunked_prefill.py
│ │ │ │ ├── test_npu_no_overlap_scheduler.py
│ │ │ │ ├── test_npu_original_logprobs.py
│ │ │ │ └── test_npu_warmups.py
│ │ │ └── speculative_inference/
│ │ │ └── test_npu_eagle3.py
│ │ ├── embedding_models/
│ │ │ └── test_npu_bge_large_en_v1_5.py
│ │ ├── interface/
│ │ │ ├── test_npu_api.py
│ │ │ ├── test_npu_api_abort_request.py
│ │ │ ├── test_npu_api_encode.py
│ │ │ ├── test_npu_enable_thinking.py
│ │ │ ├── test_npu_matched_stop.py
│ │ │ ├── test_npu_openai_function_calling.py
│ │ │ ├── test_npu_openai_server_ignore_eos.py
│ │ │ └── test_npu_penalty.py
│ │ ├── llm_models/
│ │ │ ├── test_npu_afm_4_5b.py
│ │ │ ├── test_npu_baichuan2_13b_chat.py
│ │ │ ├── test_npu_c4ai_command_r_v01.py
│ │ │ ├── test_npu_chatglm2_6b.py
│ │ │ ├── test_npu_deepseek_v3_2_exp_w8a8.py
│ │ │ ├── test_npu_exaone_3.py
│ │ │ ├── test_npu_gemma_3_4b_it_llm.py
│ │ │ ├── test_npu_glm4_9b_chat.py
│ │ │ ├── test_npu_granite_3_0_3b_a800m.py
│ │ │ ├── test_npu_granite_3_1_8b.py
│ │ │ ├── test_npu_grok_2.py
│ │ │ ├── test_npu_internlm2_7b.py
│ │ │ ├── test_npu_ling_lite.py
│ │ │ ├── test_npu_llama4_scount_17b_16e.py
│ │ │ ├── test_npu_llama_2_7b.py
│ │ │ ├── test_npu_mimo_7b_rl.py
│ │ │ ├── test_npu_minicpm3_4b.py
│ │ │ ├── test_npu_mistral_7b.py
│ │ │ ├── test_npu_persimmon_8b_chat.py
│ │ │ ├── test_npu_phi_4_multimodal_llm.py
│ │ │ ├── test_npu_qwen3_0_6b.py
│ │ │ ├── test_npu_qwen3_1_7b_gptq_int8.py
│ │ │ ├── test_npu_qwen3_235b_a22b_w8a8.py
│ │ │ ├── test_npu_qwen3_30b.py
│ │ │ ├── test_npu_qwen3_30b_w4a4.py
│ │ │ ├── test_npu_qwen3_32b.py
│ │ │ ├── test_npu_qwen3_coder_480b_a35b.py
│ │ │ ├── test_npu_qwq_32b_w8a8.py
│ │ │ ├── test_npu_smollm_1_7b.py
│ │ │ ├── test_npu_stablelm_2_1_6b.py
│ │ │ └── tool_chat_template_c4ai_command_r_v01.jinja
│ │ ├── rerank_models/
│ │ │ └── test_npu_bge_reranker_v2_m3.py
│ │ ├── reward_models/
│ │ │ ├── test_npu_gemma_2_27b_v0_2.py
│ │ │ ├── test_npu_internlm2_7b_reward.py
│ │ │ └── test_npu_llama_3_1_8b_v0_2.py
│ │ ├── test_npu_memory_consumption.py
│ │ └── vlm_models/
│ │ ├── mmmu-val.yaml
│ │ ├── test_npu_deepseek_vl2.py
│ │ ├── test_npu_gemma_3_4b_it.py
│ │ ├── test_npu_janus_pro_1b.py
│ │ ├── test_npu_janus_pro_7b.py
│ │ ├── test_npu_kimi_vl_a3b_instruct.py
│ │ ├── test_npu_llama_3_2_11b_vision_instruct.py
│ │ ├── test_npu_mimo_vl_7b_rl.py
│ │ ├── test_npu_minicpm_o_2_6.py
│ │ ├── test_npu_minicpm_v_2_6.py
│ │ ├── test_npu_mistral_small_3_1_24b_instruct_2503.py
│ │ ├── test_npu_phi4_multimodal_instruct.py
│ │ ├── test_npu_qwen2_5_vl_3b_instruct.py
│ │ ├── test_npu_qwen2_5_vl_72b_instruct.py
│ │ ├── test_npu_qwen3_vl_235b_a22b_instruct.py
│ │ ├── test_npu_qwen3_vl_30b_a3b_instruct.py
│ │ ├── test_npu_qwen3_vl_4b_instruct.py
│ │ └── test_npu_qwen3_vl_8b_instruct.py
│ ├── attention/
│ │ ├── test_chunk_gated_delta_rule.py
│ │ ├── test_create_kvindices.py
│ │ ├── test_fa3.py
│ │ ├── test_flash_attention_4.py
│ │ ├── test_hybrid_attn_backend.py
│ │ ├── test_kda_kernels.py
│ │ ├── test_local_attn.py
│ │ ├── test_torch_native_attention_backend.py
│ │ ├── test_triton_attention_backend.py
│ │ ├── test_triton_attention_kernels.py
│ │ ├── test_triton_sliding_window.py
│ │ └── test_wave_attention_kernels.py
│ ├── backends/
│ │ ├── test_deepseek_r1_fp8_trtllm_backend.py
│ │ ├── test_deepseek_v3_fp4_cutlass_moe.py
│ │ ├── test_flashinfer_trtllm_gen_attn_backend.py
│ │ ├── test_flashinfer_trtllm_gen_moe_backend.py
│ │ ├── test_qwen3_fp4_trtllm_gen_moe.py
│ │ └── test_torch_compile.py
│ ├── bench_fn/
│ │ ├── test_bench_serving_functionality.py
│ │ └── test_benchmark_datasets_api.py
│ ├── constrained_decoding/
│ │ └── test_constrained_decoding.py
│ ├── core/
│ │ ├── test_cpp_radix_cache.py
│ │ ├── test_deepseek_v3_deterministic.py
│ │ ├── test_deterministic.py
│ │ ├── test_gpt_oss_1gpu.py
│ │ ├── test_gpt_oss_sm120.py
│ │ ├── test_hidden_states.py
│ │ ├── test_page_size.py
│ │ ├── test_qwen3_next_deterministic.py
│ │ ├── test_request_queue_validation.py
│ │ ├── test_score_api.py
│ │ ├── test_srt_endpoint.py
│ │ └── test_srt_engine.py
│ ├── debug_utils/
│ │ ├── comparator/
│ │ │ ├── __init__.py
│ │ │ ├── aligner/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── entrypoint/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── conftest.py
│ │ │ │ │ ├── test_executor.py
│ │ │ │ │ └── test_planner.py
│ │ │ │ ├── reorderer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── conftest.py
│ │ │ │ │ ├── test_executor.py
│ │ │ │ │ └── test_planner.py
│ │ │ │ ├── test_axis_aligner.py
│ │ │ │ ├── token_aligner/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── conftest.py
│ │ │ │ │ ├── test_aux_loader.py
│ │ │ │ │ ├── test_aux_plugins.py
│ │ │ │ │ ├── test_concat_steps.py
│ │ │ │ │ ├── test_executor.py
│ │ │ │ │ ├── test_planner.py
│ │ │ │ │ └── test_thd_seq_lens_loader.py
│ │ │ │ └── unsharder/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── test_executor.py
│ │ │ │ ├── test_parallel_info.py
│ │ │ │ └── test_planner.py
│ │ │ ├── conftest.py
│ │ │ ├── dims_spec/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_dim_parser.py
│ │ │ │ ├── test_dims_parser.py
│ │ │ │ ├── test_tensor_naming.py
│ │ │ │ └── test_types.py
│ │ │ ├── tensor_comparator/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── test_comparator.py
│ │ │ │ ├── test_formatter.py
│ │ │ │ └── test_types.py
│ │ │ ├── test_bundle_comparator.py
│ │ │ ├── test_bundle_matcher.py
│ │ │ ├── test_display.py
│ │ │ ├── test_dp_utils.py
│ │ │ ├── test_dump_loader.py
│ │ │ ├── test_entrypoint.py
│ │ │ ├── test_log_sink.py
│ │ │ ├── test_manually_verify.py
│ │ │ ├── test_meta_overrider.py
│ │ │ ├── test_model_validation.py
│ │ │ ├── test_output_types.py
│ │ │ ├── test_per_token_visualizer.py
│ │ │ ├── test_preset.py
│ │ │ ├── test_utils.py
│ │ │ ├── test_visualizer.py
│ │ │ └── testing_helpers.py
│ │ ├── source_patcher/
│ │ │ ├── conftest.py
│ │ │ ├── test_code_patcher.py
│ │ │ ├── test_dumper_integration.py
│ │ │ └── test_source_editor.py
│ │ ├── test_crash_dump.py
│ │ ├── test_cuda_coredump_smoke.py
│ │ ├── test_dump_comparator.py
│ │ ├── test_dump_loader.py
│ │ ├── test_dumper.py
│ │ ├── test_engine_dumper_comparator_e2e.py
│ │ ├── test_schedule_simulator.py
│ │ ├── test_soft_watchdog.py
│ │ └── test_tensor_dump_forward_hook.py
│ ├── disaggregation/
│ │ ├── test_disaggregation_basic.py
│ │ ├── test_disaggregation_decode_offload.py
│ │ └── test_specv2_kvcache_offloading.py
│ ├── distributed/
│ │ ├── test_data_parallelism.py
│ │ ├── test_disaggregation_aarch64.py
│ │ ├── test_disaggregation_different_tp.py
│ │ ├── test_disaggregation_dp_attention.py
│ │ ├── test_disaggregation_hybrid_attention.py
│ │ ├── test_disaggregation_pp.py
│ │ ├── test_dp_attention.py
│ │ ├── test_dp_attention_large.py
│ │ ├── test_epd_disaggregation.py
│ │ ├── test_load_weights_from_remote_instance.py
│ │ ├── test_load_weights_from_remote_instance_npu.py
│ │ ├── test_parallel_state.py
│ │ └── test_pp_single_node.py
│ ├── dllm/
│ │ ├── test_llada2_mini.py
│ │ └── test_llada2_mini_amd.py
│ ├── embedding/
│ │ ├── test_embedding_models.py
│ │ ├── test_encoder_embedding_models.py
│ │ ├── test_input_embeddings.py
│ │ ├── test_input_embeds_chunked.py
│ │ └── test_openai_embedding.py
│ ├── ep/
│ │ ├── test_deepep_large.py
│ │ ├── test_deepep_small.py
│ │ └── test_mooncake_ep_small.py
│ ├── eval/
│ │ ├── test_eval_accuracy_large.py
│ │ ├── test_moe_eval_accuracy_large.py
│ │ ├── test_text_models_gsm8k_eval.py
│ │ └── test_vlms_mmmu_eval.py
│ ├── function_call/
│ │ └── test_kimik2_detector.py
│ ├── hicache/
│ │ ├── test_hicache_storage.py
│ │ ├── test_hicache_storage_3fs_backend.py
│ │ ├── test_hicache_storage_file_backend.py
│ │ ├── test_hicache_storage_mooncake_backend.py
│ │ ├── test_hicache_storage_runtime_attach_detach.py
│ │ └── test_hicache_variants.py
│ ├── kernels/
│ │ ├── test_fp4_moe.py
│ │ ├── test_fused_topk_deepseek.py
│ │ └── test_nsa_indexer.py
│ ├── layers/
│ │ ├── mamba/
│ │ │ ├── conftest.py
│ │ │ ├── test_causal_conv1d.py
│ │ │ ├── test_mamba2_mixer.py
│ │ │ ├── test_mamba_ssm.py
│ │ │ └── test_mamba_ssm_ssd.py
│ │ └── test_fla_layernorm_guard.py
│ ├── lora/
│ │ ├── test_chunked_sgmv_backend.py
│ │ ├── test_embedding_lora_support.py
│ │ ├── test_fused_moe_lora_kernel.py
│ │ ├── test_lora_backend.py
│ │ ├── test_lora_eviction.py
│ │ ├── test_lora_eviction_policy.py
│ │ ├── test_lora_hf_sgl_logprob_diff.py
│ │ ├── test_lora_openai_api.py
│ │ ├── test_lora_openai_compatible.py
│ │ ├── test_lora_overlap_loading.py
│ │ ├── test_lora_qwen3.py
│ │ ├── test_lora_radix_cache.py
│ │ ├── test_lora_tied_lm_head.py
│ │ ├── test_lora_tp.py
│ │ ├── test_lora_update.py
│ │ └── test_multi_lora_backend.py
│ ├── metrics/
│ │ ├── test_metrics.py
│ │ └── test_priority_metrics.py
│ ├── mla/
│ │ ├── test_flashmla.py
│ │ ├── test_mla.py
│ │ ├── test_mla_deepseek_v3.py
│ │ ├── test_mla_flashinfer.py
│ │ ├── test_mla_fp8.py
│ │ └── test_mla_int8_deepseek_v3.py
│ ├── model_loading/
│ │ ├── test_external_models.py
│ │ └── test_utils_update_weights.py
│ ├── models/
│ │ ├── test_compressed_tensors_models.py
│ │ ├── test_cross_encoder_models.py
│ │ ├── test_dummy_grok_models.py
│ │ ├── test_generation_models.py
│ │ ├── test_gpt_oss_models_pcg.py
│ │ ├── test_kimi_linear_models.py
│ │ ├── test_kimi_linear_models_pcg.py
│ │ ├── test_ministral3_models.py
│ │ ├── test_nvidia_nemotron_3_nano.py
│ │ ├── test_nvidia_nemotron_nano_v2.py
│ │ ├── test_nvidia_nemotron_nano_v2_vl.py
│ │ ├── test_qwen3_next_models_fp4.py
│ │ ├── test_qwen3_next_models_pcg.py
│ │ ├── test_qwen_models.py
│ │ ├── test_reward_models.py
│ │ ├── test_transformers_models.py
│ │ └── test_vlm_models.py
│ ├── moe/
│ │ ├── test_cutedsl_moe.py
│ │ ├── test_fused_moe.py
│ │ ├── test_glm4_moe_models.py
│ │ ├── test_moe_ep.py
│ │ ├── test_torch_compile_moe.py
│ │ ├── test_triton_fused_moe.py
│ │ └── test_triton_moe_channel_fp8_kernel.py
│ ├── openai_server/
│ │ ├── basic/
│ │ │ ├── test_anthropic_server.py
│ │ │ ├── test_openai_server.py
│ │ │ ├── test_protocol.py
│ │ │ ├── test_serving_chat.py
│ │ │ ├── test_serving_completions.py
│ │ │ └── test_serving_rerank.py
│ │ ├── features/
│ │ │ ├── test_enable_thinking.py
│ │ │ ├── test_json_mode.py
│ │ │ ├── test_openai_server_ebnf.py
│ │ │ ├── test_openai_server_hidden_states.py
│ │ │ └── test_reasoning_content.py
│ │ ├── function_call/
│ │ │ ├── test_anthropic_tool_use.py
│ │ │ ├── test_openai_function_calling.py
│ │ │ └── test_tool_choice.py
│ │ └── validation/
│ │ ├── test_large_max_new_tokens.py
│ │ ├── test_matched_stop.py
│ │ ├── test_openai_server_ignore_eos.py
│ │ └── test_request_length_validation.py
│ ├── ops/
│ │ ├── test_aiter_allreduce_fusion_amd.py
│ │ └── test_repeat_interleave.py
│ ├── perf/
│ │ ├── test_bench_one_batch_1gpu.py
│ │ ├── test_bench_one_batch_2gpu.py
│ │ ├── test_bench_serving_1gpu_large.py
│ │ ├── test_bench_serving_1gpu_part1.py
│ │ ├── test_bench_serving_1gpu_part2.py
│ │ ├── test_bench_serving_2gpu.py
│ │ ├── test_dpsk_r1_fp4_4gpu_perf.py
│ │ ├── test_gpt_oss_4gpu_perf.py
│ │ ├── test_text_models_perf.py
│ │ ├── test_vlm_perf_5090.py
│ │ └── test_vlms_perf.py
│ ├── piecewise_cuda_graph/
│ │ └── test_piecewise_cuda_graph_support_1_gpu.py
│ ├── profiling/
│ │ ├── test_profile_v2.py
│ │ └── test_start_profile.py
│ ├── quant/
│ │ ├── test_autoround.py
│ │ ├── test_awq.py
│ │ ├── test_awq_dequant.py
│ │ ├── test_block_int8.py
│ │ ├── test_bnb.py
│ │ ├── test_deepseek_v32_fp4_4gpu.py
│ │ ├── test_deepseek_v32_fp4_mtp_4gpu.py
│ │ ├── test_deepseek_v3_fp4_4gpu.py
│ │ ├── test_eval_fp8_accuracy.py
│ │ ├── test_fp8_blockwise_gemm.py
│ │ ├── test_fp8_kernel.py
│ │ ├── test_fp8_utils.py
│ │ ├── test_fp8kv_triton.py
│ │ ├── test_fused_rms_fp8_group_quant.py
│ │ ├── test_gguf.py
│ │ ├── test_gptqmodel_dynamic.py
│ │ ├── test_int4fp8_moe.py
│ │ ├── test_int8_kernel.py
│ │ ├── test_marlin_moe.py
│ │ ├── test_modelopt_fp8.py
│ │ ├── test_nvfp4_gemm.py
│ │ ├── test_quant_config_parsing.py
│ │ ├── test_quantization.py
│ │ ├── test_torchao.py
│ │ ├── test_triton_scaled_mm.py
│ │ ├── test_w4a8_deepseek_v3.py
│ │ └── test_w8a8_quantization.py
│ ├── radix_cache/
│ │ ├── test_radix_attention.py
│ │ ├── test_radix_cache_hit.py
│ │ └── test_swa_radix_cache_kl.py
│ ├── rl/
│ │ ├── test_fp32_lm_head.py
│ │ ├── test_lora_load_from_tensor.py
│ │ ├── test_multi_instance_release_memory_occupation.py
│ │ ├── test_patch_torch.py
│ │ ├── test_release_memory_occupation.py
│ │ ├── test_return_routed_experts.py
│ │ ├── test_update_weights_from_disk.py
│ │ ├── test_update_weights_from_distributed.py
│ │ └── test_update_weights_from_tensor.py
│ ├── rotary/
│ │ ├── test_mrope.py
│ │ └── test_rope_rocm.py
│ ├── sampling/
│ │ ├── test_original_logprobs.py
│ │ ├── test_penalty.py
│ │ └── test_pytorch_sampling_backend.py
│ ├── scheduler/
│ │ ├── test_abort.py
│ │ ├── test_chunked_prefill.py
│ │ ├── test_no_chunked_prefill.py
│ │ ├── test_no_overlap_scheduler.py
│ │ ├── test_prefill_delayer.py
│ │ ├── test_priority_scheduling.py
│ │ ├── test_retract_decode.py
│ │ └── test_routing_key_scheduling.py
│ ├── sessions/
│ │ ├── test_session_control.py
│ │ ├── test_session_latency.py
│ │ └── test_streaming_session.py
│ ├── spec/
│ │ ├── eagle/
│ │ │ ├── test_deepseek_v3_fp4_mtp_small.py
│ │ │ ├── test_eagle3_basic.py
│ │ │ ├── test_eagle_constrained_decoding.py
│ │ │ ├── test_eagle_dp_attention.py
│ │ │ ├── test_eagle_infer_a.py
│ │ │ ├── test_eagle_infer_b.py
│ │ │ ├── test_eagle_infer_beta.py
│ │ │ ├── test_eagle_infer_beta_dp_attention.py
│ │ │ └── test_eagle_infer_beta_dp_attention_large.py
│ │ ├── test_constrained_decoding_spec_reasoning.py
│ │ ├── test_ngram_speculative_decoding.py
│ │ ├── test_standalone_speculative_decoding.py
│ │ └── utils/
│ │ └── test_build_eagle_tree.py
│ ├── stress/
│ │ ├── test_stress_deepseek_v3.py
│ │ ├── test_stress_glm_4_6.py
│ │ ├── test_stress_kimi_k2.py
│ │ └── test_stress_qwen3_235b.py
│ ├── test_hybrid_dp_ep_tp_mtp.py
│ ├── test_srt_backend.py
│ ├── tokenizer/
│ │ ├── test_multi_tokenizer.py
│ │ └── test_skip_tokenizer_init.py
│ ├── unit/
│ │ ├── README.md
│ │ ├── batch_invariant_ops/
│ │ │ └── test_batch_invariant_ops.py
│ │ ├── entrypoints/
│ │ │ ├── openai/
│ │ │ │ └── test_serving_embedding.py
│ │ │ └── test_ssl_cert_refresher.py
│ │ ├── function_call/
│ │ │ ├── test_function_call_parser.py
│ │ │ ├── test_glm47_moe_detector.py
│ │ │ ├── test_json_schema_constraint.py
│ │ │ ├── test_parallel_tool_calls.py
│ │ │ └── test_unknown_tool_name.py
│ │ ├── layers/
│ │ │ ├── test_conv_layer.py
│ │ │ └── test_mamba_state_scatter_triton.py
│ │ ├── managers/
│ │ │ ├── test_io_struct.py
│ │ │ ├── test_prefill_adder.py
│ │ │ └── test_profile_merger_http_api.py
│ │ ├── mem_cache/
│ │ │ ├── test_evict_policy.py
│ │ │ ├── test_mamba_unittest.py
│ │ │ ├── test_nsa_pool_host_unit.py
│ │ │ ├── test_radix_cache_slru_accuracy.py
│ │ │ ├── test_radix_ca
================================================
FILE CONTENTS
================================================
================================================
FILE: .claude/skills/add-jit-kernel/SKILL.md
================================================
---
name: add-jit-kernel
description: Step-by-step tutorial for adding a new lightweight JIT CUDA kernel to sglang's jit_kernel module
---
# Tutorial: Adding a New JIT Kernel to SGLang
This tutorial walks through adding a simple element-wise scale operation as a JIT kernel. We'll implement `scale(x, factor) = x * factor` to demonstrate the complete workflow.
## Goal
Add a new operation that scales each element of a tensor by a scalar factor:
- Input: tensor `x` (CUDA) and scalar `factor` (float, passed at runtime)
- Output: `x * factor` (element-wise), allocated internally
- Supported dtypes: **FP16 (`torch.float16`), BF16 (`torch.bfloat16`), FP32 (`torch.float32`)**
## When to use JIT vs AOT (`sgl-kernel`)
- **JIT (`jit_kernel`)**: prefer this first for kernels that do **not** depend on CUTLASS or another large C++ project. It is the default choice for lightweight kernels that benefit from rapid iteration and first-use compilation.
- **AOT (`sgl-kernel`)**: prefer this when the kernel **does** depend on CUTLASS or another large C++ project, or when it should live in `sgl-kernel/` and participate in the wheel build / torch op registration flow.
- **Exception**: kernels that depend on `flashinfer`, or on CUTLASS that is already provided through `flashinfer`, can still be implemented as `jit_kernel`.
---
## Common Abstractions in `python/sglang/jit_kernel/include/sgl_kernel/`
**Always prefer these abstractions over raw CUDA primitives.** They provide safety, readability, and consistency with the rest of the codebase.
**Important include rule:** for every `#include <sgl_kernel/...>` line, add a short trailing comment explaining why that header is included (for example `// For TensorMatcher, SymbolicSize, SymbolicDevice`). This matches the current JIT kernel style and keeps include usage self-documenting.
### `utils.h` — Host-side utilities
```cpp
#include <sgl_kernel/utils.h>
```
- **`host::RuntimeCheck(cond, args...)`** — Assert a condition at runtime; throws `PanicError` with file/line info on failure. Prefer this over bare `assert`.
- **`host::Panic(args...)`** — Unconditionally throw a `PanicError` with a descriptive message.
- **`host::div_ceil(a, b)`** — Integer ceiling division `(a + b - 1) / b`.
- **`host::irange(n)`** / **`host::irange(start, end)`** — Range views for cleaner loops.
- **`host::pointer::offset(ptr, offsets...)`** — Byte-safe pointer arithmetic on `void*`. Use this instead of raw casts.
### `utils.cuh` — Device-side utilities + `LaunchKernel`
```cpp
#include <sgl_kernel/utils.cuh>
```
- **Type aliases**: `fp16_t`, `bf16_t`, `fp32_t`, `fp8_e4m3_t`, `fp8_e5m2_t` and their packed variants `fp16x2_t`, `bf16x2_t`, `fp32x2_t`, etc.
- **`SGL_DEVICE`** — Expands to `__forceinline__ __device__`. Use on all device functions.
- **`device::kWarpThreads`** — Constant `32`.
- **`device::load_as<T>(ptr, offset)`** / **`device::store_as<T>(ptr, val, offset)`** — Type-safe loads/stores from `void*`.
- **`device::pointer::offset(ptr, offsets...)`** — Pointer arithmetic on device.
- **`host::LaunchKernel(grid, block, device_or_stream [, smem])`** — RAII kernel launcher that:
- Resolves the CUDA stream from a `DLDevice` via TVM-FFI automatically.
- Launches the kernel via `operator()(kernel, args...)` and checks the CUDA error (with file/line info) after the launch.
- Supports `.enable_pdl(bool)` for PDL (Programmatic Dependent Launch, SM90+).
- **`host::RuntimeDeviceCheck(cudaError_t)`** — Check a CUDA error; throw on failure.
### `tensor.h` — Tensor validation (`TensorMatcher`, Symbolic types)
```cpp
#include <sgl_kernel/tensor.h>
```
This is the **primary validation API** for all kernel launchers. Use it to validate every `tvm::ffi::TensorView` argument.
- **`host::SymbolicSize{"name"}`** — A named symbolic dimension. Call `.set_value(n)` to pin it, `.unwrap()` to extract after verification.
- **`host::SymbolicDType`** — Symbolic dtype. Use `.set_options<Ts...>()` to restrict allowed types.
- **`host::SymbolicDevice`** — Symbolic device. Use `.set_options<kDLCUDA>()` to restrict to CUDA.
- **`host::TensorMatcher({dims...})`** — Fluent builder for tensor validation:
- `.with_dtype<T>()` — require a specific C++ type (e.g. `fp16_t`)
- `.with_dtype<T1, T2, ...>()` — allow a set of types
- `.with_device<kDLCUDA>(device_sym)` — require CUDA and bind the checked device to a `SymbolicDevice`
- `.with_strides({strides...})` — validate strides (omit to require contiguous)
- `.verify(tensor_view)` — execute the check; throws `PanicError` with full context on failure; **chainable** (`verify(a).verify(b)` to check multiple tensors with the same shape)
**Typical pattern:**
```cpp
auto N = SymbolicSize{"num_elements"};
auto device = SymbolicDevice{};
device.set_options<kDLCUDA>();
TensorMatcher({N}) //
.with_dtype<fp16_t>()
.with_device<kDLCUDA>(device)
.verify(dst)
.verify(src); // same shape, dtype, device as dst
const size_t n = N.unwrap();
const DLDevice dev = device.unwrap();
```
### `type.cuh` — `dtype_trait<T>` and `packed_t<T>`
```cpp
#include <sgl_kernel/type.cuh>
```
- **`dtype_trait<T>`** — Static trait struct for each scalar type. Provides:
- `dtype_trait<T>::from(value)` — convert from another type (e.g. `fp32_t` → `fp16_t`)
- `dtype_trait<T>::abs/sqrt/rsqrt/exp/sin/cos(x)` — type-dispatched unary math (primarily for `fp32_t`)
- `dtype_trait<T>::max/min(x, y)` — type-dispatched binary math (primarily for `fp32_t`)
- **`packed_t<T>`** — Two-element packed alias: `packed_t<fp16_t>` = `fp16x2_t`, `packed_t<bf16_t>` = `bf16x2_t`, `packed_t<fp32_t>` = `fp32x2_t`. Use for vectorized loads/stores.
- **`device::cast<To, From>(value)`** — Type-safe cast using `dtype_trait`, e.g. `cast<fp32x2_t, fp16x2_t>(v)`.
### `vec.cuh` — Vectorized memory access (`AlignedVector`)
```cpp
#include <sgl_kernel/vec.cuh>
```
- **`device::AlignedVector<T, N>`** — Aligned storage for N elements of type T. N must be a power of two, `sizeof(T)*N <= 32`. Enables vectorized loads/stores for bandwidth efficiency. In terms of API/codegen constraints, the upper bound is 256-bit; in practice, 128-bit is the portable default, while 256-bit vectorization is typically only viable on `SM100+` and should be gated by an architecture check when needed.
- `.load(ptr, offset)` — vectorized load from `ptr[offset]`
- `.store(ptr, offset)` — vectorized store to `ptr[offset]`
- `.fill(value)` — fill all lanes
- `operator[](i)` — element access
### `tile.cuh` — `tile::Memory` (strided memory access pattern)
```cpp
#include <sgl_kernel/tile.cuh>
```
- `tile::Memory<T>` is fundamentally a **1D cooperative accessor** over a contiguous region.
- **`device::tile::Memory<T>::cta(blockDim.x)`** — Creates a tile accessor where each thread handles `tid = threadIdx.x` with stride `tsize` (for `cta(blockDim.x)`, this is `blockDim.x`). Common for loops over a 1D array.
- **`.load(ptr, offset)`** — loads `ptr[tid + offset * tsize]`
- **`.store(ptr, val, offset)`** — stores to `ptr[tid + offset * tsize]`
- **`.in_bound(n, offset)`** — boundary check
For a **2D tile**, either flatten `(row, col)` into a linear tile index first, or compute the address manually with `ptr[row * stride + col]` using your thread/block coordinates.
### `math.cuh` — Device math (`device::math::`)
```cpp
#include <sgl_kernel/math.cuh>
```
- `device::math::max/min<T>(a, b)` — type-dispatched binary math via `dtype_trait`
- `device::math::abs/sqrt/rsqrt/exp/sin/cos<T>(x)` — type-dispatched unary math via `dtype_trait`
### `warp.cuh` — Warp-level primitives
```cpp
#include <sgl_kernel/warp.cuh>
```
- `device::warp::reduce_sum<T>(value)` — warp-level sum reduction via `__shfl_xor_sync`
- `device::warp::reduce_max<T>(value)` — warp-level max reduction
### `cta.cuh` — CTA-level primitives
```cpp
#include <sgl_kernel/cta.cuh>
```
- `device::cta::reduce_max<T>(value, smem, min_value)` — CTA-wide max using shared memory + warp reduction. If the result in `smem[0]` is needed, the caller is responsible for issuing a `__syncthreads()` afterwards, before reading it.
### `atomic.cuh` — Atomic operations
```cpp
#include <sgl_kernel/atomic.cuh>
```
- `device::atomic::max(float* addr, float value)` — float atomic max (handles negative values correctly via bit tricks).
### `runtime.cuh` — Occupancy and device info
```cpp
#include <sgl_kernel/runtime.cuh>
```
- `host::runtime::get_blocks_per_sm(kernel, block_dim)` — max active blocks per SM (occupancy)
- `host::runtime::get_sm_count(device_id)` — number of SMs on the device
- `host::runtime::get_cc_major(device_id)` — compute capability major version
**Persistent kernel pattern** (cap blocks to SM count × occupancy):
```cpp
static const uint32_t max_occ = runtime::get_blocks_per_sm(kernel, kBlockSize);
static const uint32_t num_sm = runtime::get_sm_count(device.unwrap().device_id);
const auto num_blocks = std::min(num_sm * max_occ, div_ceil(n, kBlockSize));
LaunchKernel(num_blocks, kBlockSize, device.unwrap())(kernel, params);
```
---
## Step 0 (optional): Generate a `.clangd` config for better IDE support
```bash
python -m sglang.jit_kernel
```
---
## Step 1: Implement the CUDA kernel in `jit_kernel/csrc/`
Create `python/sglang/jit_kernel/csrc/elementwise/scale.cuh`.
The implementation fully uses the project abstractions described above:
```cpp
#include <sgl_kernel/tensor.h> // For TensorMatcher, SymbolicSize, SymbolicDevice
#include <sgl_kernel/type.cuh> // For dtype_trait, fp16_t, bf16_t, fp32_t
#include <sgl_kernel/utils.h> // For RuntimeCheck, div_ceil
#include <sgl_kernel/utils.cuh> // For LaunchKernel, SGL_DEVICE
#include <sgl_kernel/vec.cuh> // For AlignedVector
#include <dlpack/dlpack.h>
#include <tvm/ffi/container/tensor.h>
namespace {
// ----------------------------------------------------------------
// scale_kernel: dst[i] = src[i] * factor for every i in [0, n_total)
//   T      = fp16_t | bf16_t | fp32_t
//   kVecN  = number of elements moved per AlignedVector load/store
//   factor = runtime scale factor (the multiply is done in float)
// Whole kVecN-wide vectors are handled first; the remaining
// (n_total % kVecN) elements are processed one at a time.
// ----------------------------------------------------------------
template <typename T, int kVecN>
__global__ void scale_kernel(T* __restrict__ dst,
                             const T* __restrict__ src,
                             float factor,
                             uint32_t n_total) {
  using vec_t = device::AlignedVector<T, kVecN>;

  // Both loops start at this thread's global index and advance by the
  // total number of threads in the grid.
  const uint32_t global_tid = blockIdx.x * blockDim.x + threadIdx.x;
  const uint32_t grid_threads = blockDim.x * gridDim.x;

  // --- vectorised body ---
  const uint32_t full_vecs = n_total / kVecN;
  for (uint32_t vec_idx = global_tid; vec_idx < full_vecs; vec_idx += grid_threads) {
    vec_t chunk;
    chunk.load(src, vec_idx);
#pragma unroll
    for (int lane = 0; lane < kVecN; ++lane) {
      const float scaled = static_cast<float>(chunk[lane]) * factor;
      chunk[lane] = static_cast<T>(scaled);
    }
    chunk.store(dst, vec_idx);
  }

  // --- scalar tail ---
  const uint32_t tail_begin = full_vecs * kVecN;
  for (uint32_t pos = tail_begin + global_tid; pos < n_total; pos += grid_threads) {
    dst[pos] = static_cast<T>(static_cast<float>(src[pos]) * factor);
  }
}
// ----------------------------------------------------------------
// Launcher: validates tensors, selects vector width, launches kernel
//   dst    = output tensor (1D; checked against the same matcher as src)
//   src    = input tensor
//   factor = runtime scale factor forwarded to the kernel
// Fails via TensorMatcher / RuntimeCheck when validation does not pass.
// ----------------------------------------------------------------
template <typename T>
void scale(tvm::ffi::TensorView dst, tvm::ffi::TensorView src, float factor) {
  using namespace host;

  // 1. Validate input tensors with TensorMatcher
  SymbolicSize N = {"num_elements"};
  SymbolicDevice device_;
  device_.set_options<kDLCUDA>();
  TensorMatcher({N})  //
      .with_dtype<T>()
      .with_device<kDLCUDA>(device_)
      .verify(dst)
      .verify(src);  // same shape / dtype / device as dst

  const auto num_elements = N.unwrap();
  const uint32_t n = static_cast<uint32_t>(num_elements);
  const DLDevice device = device_.unwrap();

  // The kernel uses 32-bit element counters. Reject sizes that do not survive
  // the narrowing cast instead of silently scaling only part of the tensor.
  RuntimeCheck(static_cast<decltype(num_elements)>(n) == num_elements,
               "scale: num_elements does not fit in 32 bits, got ", num_elements);
  RuntimeCheck(n > 0, "scale: num_elements must be > 0, got ", n);

  // 2. Choose vector width for 128-bit loads (16 bytes)
  //    fp16/bf16: 8 elements × 2 bytes = 16 bytes
  //    fp32:      4 elements × 4 bytes = 16 bytes
  constexpr int kVecN = 16 / sizeof(T);
  const uint32_t n_work_items = div_ceil(n, static_cast<uint32_t>(kVecN));

  // 3. Launch: one thread per vector-sized work item, rounded up to whole blocks
  constexpr uint32_t kBlockSize = 256;
  const uint32_t grid = div_ceil(n_work_items, kBlockSize);
  LaunchKernel(grid, kBlockSize, device)(
      scale_kernel<T, kVecN>,
      static_cast<T*>(dst.data_ptr()),
      static_cast<const T*>(src.data_ptr()),
      factor,
      n);
}
} // namespace
```
**Key points:**
- Include headers from `sgl_kernel/` — **not** raw CUDA headers for anything already covered
- Add a short trailing `// For ...` explanation to every `#include <sgl_kernel/...>` line
- Use `TensorMatcher` for all tensor validation; never manually check shape/dtype/device
- Use `AlignedVector` for vectorised 128-bit loads/stores — significant bandwidth win
- Use `LaunchKernel` — it resolves the stream and checks errors automatically
- Use `RuntimeCheck` for runtime assertions with useful error messages
- Prefer passing runtime scalars like `factor` directly unless compile-time specialisation is genuinely required
- `fp16_t` / `bf16_t` / `fp32_t` are the project's type aliases (from `utils.cuh`)
- `device::cast<To, From>` or `dtype_trait<T>::from(val)` for cross-type conversions
- `device::math::` functions for device math instead of bare `__` intrinsics
---
## Step 2: Add the Python wrapper in `jit_kernel/`
Create `python/sglang/jit_kernel/scale.py`:
```python
from __future__ import annotations
from typing import TYPE_CHECKING
import torch
from sglang.jit_kernel.utils import cache_once, load_jit, make_cpp_args
if TYPE_CHECKING:
from tvm_ffi.module import Module
@cache_once
def _jit_scale_module(dtype: torch.dtype) -> Module:
    """Build (or fetch from cache) the JIT `scale` module specialised for `dtype`."""
    cpp_args = make_cpp_args(dtype)
    # Export the templated C++ launcher under the Python-visible name "scale".
    wrappers = [("scale", f"scale<{cpp_args}>")]
    return load_jit(
        "scale",
        *cpp_args,
        cuda_files=["elementwise/scale.cuh"],
        cuda_wrappers=wrappers,
    )
def scale(src: torch.Tensor, factor: float, out: torch.Tensor | None = None) -> torch.Tensor:
"""
Element-wise scale: dst = src * factor.
Supported dtypes: torch.float16, torch.bfloat16, torch.float32.
Parameters
----------
src : CUDA tensor (FP16 / BF16 / FP32)
factor : scale factor
out : optional pre-allocated output tensor (same shape/dtype as src)
Returns
-------
Scaled tensor (dst = src * factor).
"""
if not src.is_cuda:
raise RuntimeError("src must be a CUDA tensor")
if src.dtype not in (torch.float16, torch.bfloat16, torch.float32):
raise RuntimeError(
f"Unsupported dtype {src.dtype}. Supported: float16, bfloat16, float32"
)
if out is None:
out = torch.empty_like(src)
else:
if out.shape != src.shape:
raise RuntimeError("out shape must match src")
if out.dtype != src.dtype:
raise RuntimeError("out dtype must match src")
if out.device != src.device:
raise RuntimeError("out device must match src")
# Keep the Python wrapper thin, but still enforce the basic preconditions
# that the current JIT/FFI path does not reject safely on its own.
module = _jit_scale_module(src.dtype)
module.scale(out, src, factor)
return out
```
**Key points:**
- Use `cache_once` — **not** `functools.lru_cache` (incompatible with `torch.compile`)
- `load_jit` first arg(s) form the unique build marker; same marker = same cached binary
- Only include compile-time specialisation knobs in the build marker; runtime values like `factor` should stay runtime unless the kernel truly needs templating
- `cuda_wrappers`: `(export_name, kernel_symbol)` — `export_name` is called from Python
- Keep Python launchers thin, but still validate the basic invariants (`is_cuda`, supported dtype, `out` metadata). In the current JIT/FFI path, invalid tensors are not always rejected safely before launch
- `make_cpp_args(dtype, ...)` converts `torch.dtype` to C++ type alias:
| `torch.dtype` | C++ type |
|--------------------|------------|
| `torch.float16` | `fp16_t` |
| `torch.bfloat16` | `bf16_t` |
| `torch.float32` | `fp32_t` |
---
## Step 3 (optional): Tune JIT build flags
```python
return load_jit(
"scale",
*args,
cuda_files=["elementwise/scale.cuh"],
cuda_wrappers=[("scale", f"scale<{args}>")],
extra_cuda_cflags=["-O3", "--use_fast_math"],
)
```
If your kernel requires SM90+, raise a clear Python error before calling `load_jit`:
```python
if torch.cuda.get_device_capability()[0] < 9:
raise RuntimeError("This kernel requires SM90 (Hopper) or later")
```
---
## Step 4: Write tests (required)
Create `python/sglang/jit_kernel/tests/test_scale.py`:
```python
import pytest
import torch
from sglang.jit_kernel.scale import scale
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
@pytest.mark.parametrize("size", [1, 127, 128, 1024, 4097])  # cover tail remainder
@pytest.mark.parametrize("factor", [0.5, 1.0, 2.0, 3.0])
def test_scale_correctness(dtype, size, factor):
    """The JIT kernel must match `src * factor` for every dtype/size/factor combination."""
    src = torch.randn(size, dtype=dtype, device="cuda")
    actual = scale(src, factor)
    reference = src * factor
    # fp32 is compared tightly; half-precision types get a looser tolerance.
    if dtype == torch.float32:
        rtol, atol = 1e-5, 1e-6
    else:
        rtol, atol = 1e-2, 1e-2
    torch.testing.assert_close(actual, reference, rtol=rtol, atol=atol)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
def test_scale_out_param(dtype):
    """A caller-supplied `out` tensor is filled in place and returned as-is."""
    src = torch.randn(1024, dtype=dtype, device="cuda")
    buffer = torch.empty_like(src)
    returned = scale(src, 2.0, out=buffer)
    assert returned is buffer
    torch.testing.assert_close(buffer, src * 2.0, rtol=1e-2, atol=1e-2)
def test_scale_cpu_error():
    """A CPU tensor must be rejected with a RuntimeError mentioning CUDA."""
    cpu_src = torch.randn(128, dtype=torch.float16)  # CPU tensor
    with pytest.raises(RuntimeError, match="CUDA"):
        scale(cpu_src, 2.0)
def test_scale_unsupported_dtype():
    """An integer tensor must be rejected with a RuntimeError mentioning the dtype."""
    int_src = torch.randint(0, 10, (128,), dtype=torch.int32, device="cuda")
    with pytest.raises(RuntimeError, match="dtype"):
        scale(int_src, 2.0)
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])
```
---
## Step 5: Add a benchmark (required)
Create `python/sglang/jit_kernel/benchmark/bench_scale.py`:
```python
import itertools
import torch
import triton
import triton.testing
from sglang.jit_kernel.benchmark.utils import (
DEFAULT_DEVICE,
DEFAULT_DTYPE,
get_benchmark_range,
run_benchmark,
)
from sglang.jit_kernel.scale import scale as jit_scale
SIZE_LIST = get_benchmark_range(
full_range=[2**n for n in range(10, 20)], # 1K … 512K elements
ci_range=[4096, 65536],
)
configs = list(itertools.product(SIZE_LIST))
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["size"],
        x_vals=configs,
        line_arg="provider",
        line_vals=["jit", "torch"],
        line_names=["SGL JIT Kernel", "PyTorch"],
        styles=[("blue", "-"), ("red", "--")],
        ylabel="us",
        plot_name="scale-performance",
        args={},
    )
)
def benchmark(size: int, provider: str):
    """Time the JIT scale kernel or the PyTorch baseline on a `size`-element tensor."""
    src = torch.randn(size, dtype=DEFAULT_DTYPE, device=DEFAULT_DEVICE)
    factor = 2.0

    def run_jit():
        return jit_scale(src, factor)

    def run_torch():
        return src * factor

    # Any provider other than "jit" falls back to the PyTorch baseline.
    return run_benchmark(run_jit if provider == "jit" else run_torch)
if __name__ == "__main__":
benchmark.run(print_data=True)
```
Run:
```bash
python python/sglang/jit_kernel/benchmark/bench_scale.py
```
---
## Troubleshooting
- **JIT compilation fails**: ensure the `.cuh` file is under `python/sglang/jit_kernel/csrc/`; reduce template argument combinations
- **CUDA crash / illegal memory access**: `CUDA_LAUNCH_BLOCKING=1`; `compute-sanitizer --tool memcheck python ...`
- **Unstable benchmark results**: `run_benchmark` uses CUDA-graph-based timing by default
---
## References
- `docs/developer_guide/development_jit_kernel_guide.md`
- `python/sglang/jit_kernel/utils.py` — `cache_once`, `load_jit`, `make_cpp_args`
- `python/sglang/jit_kernel/include/sgl_kernel/tensor.h` — `TensorMatcher`, `SymbolicSize/DType/Device`
- `python/sglang/jit_kernel/include/sgl_kernel/utils.cuh` — type aliases, `LaunchKernel`, `SGL_DEVICE`
- `python/sglang/jit_kernel/include/sgl_kernel/vec.cuh` — `AlignedVector`
- `python/sglang/jit_kernel/include/sgl_kernel/tile.cuh` — `tile::Memory`
- `python/sglang/jit_kernel/include/sgl_kernel/type.cuh` — `dtype_trait`, `packed_t`, `device::cast`
- `python/sglang/jit_kernel/include/sgl_kernel/math.cuh` — `device::math::`
- `python/sglang/jit_kernel/include/sgl_kernel/warp.cuh` — `warp::reduce_sum/max`
- `python/sglang/jit_kernel/include/sgl_kernel/cta.cuh` — `cta::reduce_max`
- `python/sglang/jit_kernel/include/sgl_kernel/atomic.cuh` — `atomic::max`
- `python/sglang/jit_kernel/include/sgl_kernel/runtime.cuh` — occupancy / SM count helpers
- `python/sglang/jit_kernel/csrc/add_constant.cuh` — minimal runnable reference
- `python/sglang/jit_kernel/csrc/elementwise/rmsnorm.cuh` — real example using `TensorMatcher` + `LaunchKernel` + `tile::Memory`
- `python/sglang/jit_kernel/csrc/elementwise/qknorm.cuh` — real example using `runtime::get_blocks_per_sm` + persistent kernel pattern
- `python/sglang/jit_kernel/benchmark/utils.py` — benchmark helpers
## Summary of Files Created
```
python/sglang/jit_kernel/csrc/elementwise/scale.cuh # NEW: CUDA kernel
python/sglang/jit_kernel/scale.py # NEW: Python wrapper
python/sglang/jit_kernel/tests/test_scale.py # NEW: Tests
python/sglang/jit_kernel/benchmark/bench_scale.py # NEW: Benchmark
```
================================================
FILE: .claude/skills/add-sgl-kernel/SKILL.md
================================================
---
name: add-sgl-kernel
description: Step-by-step tutorial for adding a heavyweight AOT CUDA/C++ kernel to sgl-kernel (including tests & benchmarks)
---
# Tutorial: Adding a New Kernel to `sgl-kernel` (AOT / Heavyweight)
This tutorial walks through adding a simple element-wise scale operation as an AOT kernel. We'll implement `scale(x, factor) = x * factor` to demonstrate the complete workflow.
## Goal
Add a new operation that scales each element of a tensor by a scalar factor:
- Input: tensor `x` (CUDA) and scalar `factor` (float)
- Output: `x * factor` (element-wise, in-place or into pre-allocated `out`)
- Supported dtypes: **FP16 (`torch.float16`), BF16 (`torch.bfloat16`), FP32 (`torch.float32`)**
- Dispatched via `DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16` macro (defined in `sgl-kernel/include/utils.h`)
## Two rules of thumb (must follow)
1. **Prefer `python/sglang/jit_kernel` first** when the kernel does **not** depend on CUTLASS or another large C++ project. This is the default path for lightweight kernels that benefit from rapid iteration.
2. **Prefer `sgl-kernel`** when the kernel **does** depend on CUTLASS or another large C++ project, or when it should be part of the AOT wheel / torch op registration flow.
3. **Exception**: if the dependency is `flashinfer`, or CUTLASS that is already provided through `flashinfer`, the kernel can still be implemented as `jit_kernel`.
In addition, every new kernel must ship with:
- **Tests** (pytest)
- **A benchmark script** (triton.testing)
---
## Repository integration map
You will typically touch these files/areas:
- Implementation: `sgl-kernel/csrc/elementwise/scale.cu` (pick the right subdirectory)
- Public declarations: `sgl-kernel/include/sgl_kernel_ops.h`
- Torch extension registration: `sgl-kernel/csrc/common_extension.cc`
- Build: `sgl-kernel/CMakeLists.txt` (`set(SOURCES ...)`)
- Python API: `sgl-kernel/python/sgl_kernel/` and `sgl-kernel/python/sgl_kernel/__init__.py`
- Tests: `sgl-kernel/tests/test_scale.py`
- Benchmarks: `sgl-kernel/benchmark/bench_scale.py`
---
## Step 1: Implement the kernel in `csrc/`
Pick the right subdirectory:
- `csrc/elementwise/` — for element-wise ops (our example)
- `csrc/gemm/`, `csrc/attention/`, `csrc/moe/` — for other categories
Create `sgl-kernel/csrc/elementwise/scale.cu`:
```cpp
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>
#include "utils.h" // DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16
// scale_kernel: out[i] = input[i] * factor for i in [0, n)
// Templated on T so it can be instantiated for float, half (__half) and __nv_bfloat16.
// Each thread handles at most one element; the multiply is carried out in float.
template <typename T>
__global__ void scale_kernel(T* __restrict__ out,
                             const T* __restrict__ input,
                             float factor,
                             int64_t n) {
  // Widen before multiplying so the global index is computed in 64 bits.
  const int64_t block_start = static_cast<int64_t>(blockIdx.x) * blockDim.x;
  const int64_t elem = block_start + threadIdx.x;
  if (elem >= n) {
    return;  // threads past the end of the tensor have nothing to do
  }
  const float scaled = static_cast<float>(input[elem]) * factor;
  out[elem] = static_cast<T>(scaled);
}
// Host launcher for scale_kernel: writes input * factor into out (element-wise).
//   out    : pre-allocated, contiguous CUDA tensor; same shape/dtype/device as input
//   input  : contiguous CUDA tensor (float32 / float16 / bfloat16)
//   factor : scale factor; arrives as double from the torch op schema and is
//            narrowed to float for the kernel
// Raises (via TORCH_CHECK) when validation fails or the launch reports an error.
void scale(at::Tensor& out, const at::Tensor& input, double factor) {
  TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor");
  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
  TORCH_CHECK(out.is_cuda(), "out must be a CUDA tensor");
  TORCH_CHECK(out.is_contiguous(), "out must be contiguous");
  TORCH_CHECK(out.device() == input.device(), "out and input must be on the same device");
  TORCH_CHECK(out.sizes() == input.sizes(), "out and input must have the same shape");
  TORCH_CHECK(out.scalar_type() == input.scalar_type(),
              "out and input must have the same dtype");

  const int64_t n = input.numel();
  // A grid with zero blocks is an invalid launch configuration, and there is
  // nothing to compute for an empty tensor anyway.
  if (n == 0) {
    return;
  }

  constexpr int64_t kThreads = 256;
  // Largest gridDim.x accepted by CUDA (2^31 - 1).
  constexpr int64_t kMaxGridDimX = 2147483647;
  const int64_t blocks = (n + kThreads - 1) / kThreads;
  TORCH_CHECK(blocks <= kMaxGridDimX, "input is too large for a 1D grid launch, numel = ", n);

  // Switch to input's device BEFORE asking for the current stream; otherwise the
  // stream returned belongs to whatever device happened to be current.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Dispatches over float, float16, bfloat16
  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16(input.scalar_type(), c_type, [&] {
    scale_kernel<c_type><<<static_cast<unsigned int>(blocks),
                           static_cast<unsigned int>(kThreads),
                           0,
                           stream>>>(
        static_cast<c_type*>(out.data_ptr()),
        static_cast<const c_type*>(input.data_ptr()),
        static_cast<float>(factor),
        n);
    // Kernel launches do not throw; surface configuration errors immediately.
    cudaError_t status = cudaGetLastError();
    TORCH_CHECK(status == cudaSuccess,
                "scale_kernel launch failed: ", cudaGetErrorString(status));
    return true;
  });
}
```
**Key points:**
- Use `at::Tensor` (PyTorch tensors), `TORCH_CHECK` for validation, `at::cuda::getCurrentCUDAStream()` for stream
- Keep Python wrappers thin; do shape/dtype/device validation in C++ right around the launch path
- `DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16` covers `float`, `half` (FP16), `__nv_bfloat16` (BF16)
- Add device error checking after every kernel launch
- If a kernel only works on certain architectures, enforce that with `TORCH_CHECK` and skip logic in tests
---
## Step 2: Add a C++ declaration in `include/sgl_kernel_ops.h`
Edit `sgl-kernel/include/sgl_kernel_ops.h`, add to the elementwise section:
```cpp
void scale(at::Tensor& out, const at::Tensor& input, double factor);
```
---
## Step 3: Register the op in `csrc/common_extension.cc`
Edit `sgl-kernel/csrc/common_extension.cc`, inside `TORCH_LIBRARY_FRAGMENT(sgl_kernel, m)`:
```cpp
// From csrc/elementwise
m.def("scale(Tensor! out, Tensor input, float factor) -> ()");
m.impl("scale", torch::kCUDA, &scale);
```
**Key points:**
- `Tensor!` means in-place / mutable output argument
- The schema is important for `torch.compile` and for consistent call signatures
- Keep the torch schema in PyTorch scalar types (`float` here), but note that the C++ launcher signature still needs `double` for scalar arguments accepted by `torch::Library`
---
## Step 4: Add the new source file to `CMakeLists.txt`
Edit `sgl-kernel/CMakeLists.txt`, add to `set(SOURCES ...)`:
```cmake
csrc/elementwise/scale.cu
```
**Key points:**
- Keep the list **alphabetically sorted** (the file explicitly requires this)
- If the kernel has arch constraints, reflect that in tests/benchmarks via skip logic
---
## Step 5: Expose a Python API under `sgl-kernel/python/sgl_kernel/`
Prefer following the existing module organization first. For elementwise kernels, the usual pattern is:
- implement the Python wrapper in `sgl-kernel/python/sgl_kernel/elementwise.py`
- then re-export it from `sgl-kernel/python/sgl_kernel/__init__.py`
For example, in `sgl-kernel/python/sgl_kernel/elementwise.py`, add:
```python
import torch
def scale(
    input: torch.Tensor,
    factor: float,
    out: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Compute ``out = input * factor`` element-wise via the registered torch op.

    Supported dtypes: torch.float16, torch.bfloat16, torch.float32.

    Parameters
    ----------
    input  : CUDA input tensor
    factor : scale factor (float)
    out    : optional pre-allocated CUDA output tensor (same shape/dtype as input)

    Returns
    -------
    The tensor holding the result (``out`` itself when it was provided).
    """
    # Allocate the destination only when the caller did not supply one.
    result = out if out is not None else torch.empty_like(input)
    torch.ops.sgl_kernel.scale.default(result, input, factor)
    return result
```
Then re-export it from `sgl-kernel/python/sgl_kernel/__init__.py` following the existing import style used by other kernels.
---
## Step 6: Write tests (required)
Create `sgl-kernel/tests/test_scale.py`:
```python
import pytest
import torch
import sgl_kernel
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32])
# 1 / 127 / 4097 are not multiples of the 256-thread block, so the kernel's
# `idx < n` bounds guard is exercised on a partially filled last block.
@pytest.mark.parametrize("size", [1, 127, 128, 1024, 4096, 4097, 65536])
@pytest.mark.parametrize("factor", [0.5, 1.0, 2.0])
def test_scale_correctness(dtype, size, factor):
    """sgl_kernel.scale must match `input * factor` and return the provided `out`."""
    input = torch.randn(size, dtype=dtype, device="cuda")
    out = torch.empty_like(input)

    result = sgl_kernel.scale(input, factor, out=out)
    assert result is out

    expected = input * factor
    # fp32 is compared tightly; half-precision types get a looser tolerance.
    rtol, atol = (1e-5, 1e-6) if dtype == torch.float32 else (1e-2, 1e-2)
    torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
def test_scale_shape_mismatch():
    """An `out` tensor whose shape differs from the input must be rejected."""
    src = torch.randn(128, dtype=torch.float16, device="cuda")
    wrong_out = torch.empty(256, dtype=torch.float16, device="cuda")
    with pytest.raises(RuntimeError, match="same shape"):
        sgl_kernel.scale(src, 2.0, out=wrong_out)
def test_scale_cpu_input():
    """CPU tensors must be rejected with a RuntimeError mentioning CUDA."""
    cpu_src = torch.randn(128, dtype=torch.float16)  # CPU
    cpu_out = torch.empty_like(cpu_src)
    with pytest.raises(RuntimeError, match="CUDA"):
        sgl_kernel.scale(cpu_src, 2.0, out=cpu_out)
if __name__ == "__main__":
pytest.main([__file__, "-q"])
```
---
## Step 7: Add a benchmark (required)
Create `sgl-kernel/benchmark/bench_scale.py`:
```python
import itertools
import os
import torch
import triton
import triton.testing
import sgl_kernel
IS_CI = (
os.getenv("CI", "false").lower() == "true"
or os.getenv("GITHUB_ACTIONS", "false").lower() == "true"
)
dtypes = [torch.float16] if IS_CI else [torch.float16, torch.bfloat16, torch.float32]
sizes = [4096] if IS_CI else [2**n for n in range(10, 20)] # 1K … 512K
factors = [2.0]
configs = list(itertools.product(dtypes, sizes))
def torch_scale(input: torch.Tensor, factor: float) -> torch.Tensor:
    """PyTorch baseline for the benchmark: returns a new tensor equal to input * factor."""
    return torch.mul(input, factor)
@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["dtype", "size"],
        x_vals=configs,
        line_arg="provider",
        line_vals=["sglang", "torch"],
        line_names=["SGL Kernel", "PyTorch"],
        styles=[("green", "-"), ("red", "--")],
        ylabel="µs (median)",
        plot_name="scale-performance",
        args={},
    )
)
def benchmark(dtype, size, provider):
    """
    Time one (dtype, size) configuration for the given provider.

    Returns (median, low, high) in microseconds, where low/high are the
    20th/80th percentile timings. `perf_report` unpacks the tuple in exactly
    this (mean, min, max) order.
    """
    input = torch.randn(size, dtype=dtype, device="cuda")
    out = torch.empty_like(input)
    factor = 2.0

    if provider == "sglang":
        fn = lambda: sgl_kernel.scale(input, factor, out=out)
    else:
        fn = lambda: torch_scale(input, factor)

    # Quantiles come back in the order requested: median, 20th pct, 80th pct.
    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
        fn, quantiles=[0.5, 0.2, 0.8]
    )
    # Convert ms -> µs and keep the (mean, min, max) order expected by perf_report.
    return 1000 * ms, 1000 * min_ms, 1000 * max_ms
if __name__ == "__main__":
benchmark.run(print_data=True)
```
---
## Step 8: Build
Build:
```bash
cd sgl-kernel
make build -j16
```
If you need to limit host resource usage:
```bash
cd sgl-kernel
make build -j1 MAX_JOBS=2 CMAKE_ARGS="-DSGL_KERNEL_COMPILE_THREADS=1"
```
---
## Step 9: Validate
After building successfully, run the test and benchmark:
```bash
pytest sgl-kernel/tests/test_scale.py -q
python sgl-kernel/benchmark/bench_scale.py
```
---
## Troubleshooting
- **Async CUDA errors**: `CUDA_LAUNCH_BLOCKING=1`
- **Memory errors**: `compute-sanitizer --tool memcheck python ...`
- **Build is too slow / OOM**: reduce `MAX_JOBS` and `SGL_KERNEL_COMPILE_THREADS`
- **Binary bloat**: use `sgl-kernel/analyze_whl_kernel_sizes.py`
- **CMake sources list**: if your `.cu` file is missing from `SOURCES`, the symbol will be undefined at link time
---
## References
- `sgl-kernel/README.md`
- `sgl-kernel/include/sgl_kernel_ops.h`
- `sgl-kernel/csrc/common_extension.cc`
- `sgl-kernel/CMakeLists.txt`
- `sgl-kernel/include/utils.h` — `DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FLOAT_FP16` macro and friends
- `sgl-kernel/csrc/elementwise/activation.cu` — reference for the FP16/BF16/FP32 dispatch pattern
## Summary of Files Created/Modified
```
sgl-kernel/csrc/elementwise/scale.cu # NEW: CUDA kernel + launcher
sgl-kernel/include/sgl_kernel_ops.h # MODIFIED: C++ declaration
sgl-kernel/csrc/common_extension.cc # MODIFIED: schema + dispatch registration
sgl-kernel/CMakeLists.txt # MODIFIED: add source file (alphabetical)
sgl-kernel/python/sgl_kernel/elementwise.py # MODIFIED: Python wrapper
sgl-kernel/python/sgl_kernel/__init__.py # MODIFIED: re-export Python API
sgl-kernel/tests/test_scale.py # NEW: tests
sgl-kernel/benchmark/bench_scale.py # NEW: benchmark
```
================================================
FILE: .claude/skills/sglang-bisect-ci-regression/SKILL.md
================================================
# SGLang Bisect CI Regression
Investigate a consistently failing CI test to find the root cause - whether it's a code regression from a specific PR, a hardware/runner-specific issue, or an environment change. Optionally reproduce the failure on a remote GPU server.
## Slash Command
`/sglang-bisect-ci-regression <test_name_or_ci_url> [ssh_target] [docker_container]`
## When to Use This Skill
- A CI test is failing consistently on main (scheduled runs)
- You need to find which PR introduced a regression
- You suspect a runner-specific or GPU-specific issue
- You want to reproduce a CI failure on a remote server
## Arguments
- **First argument (required)**: Test file name (e.g. `test_lora_tp.py`) or a GitHub Actions job URL
- **Second argument (optional)**: SSH target for remote reproduction (e.g. `user@host`)
- **Third argument (optional)**: Docker container name on the SSH target (e.g. `sglang_dev`)
If SSH target and docker container are not provided, the skill will only perform the CI log analysis and bisection, without remote reproduction. **Ask the user** for these if reproduction is needed and they weren't provided.
## Background: Scheduled CI Runs
SGLang uses the `pr-test.yml` workflow with **scheduled runs** (cron-triggered) to periodically test the `main` branch. These runs are the primary data source for detecting regressions:
- **Workflow**: `pr-test.yml` with `event: schedule`
- **Branch**: `main`
- **Dashboard**: https://github.com/sgl-project/sglang/actions/workflows/pr-test.yml?query=event%3Aschedule
- **Frequency**: Runs multiple times daily, each pinned to the HEAD of `main` at trigger time
- **Purpose**: Catches regressions that slip through PR-level CI (e.g., interaction bugs between merged PRs, hardware-specific issues)
Always use these scheduled runs (not PR-triggered runs) when bisecting regressions on `main`. The `--event schedule` filter in `gh run list` ensures you only see these periodic main-branch runs.
## Workflow
### Phase 1: Extract the Failure Signature
1. **Get the failing test details from CI logs.** If given a URL, fetch logs directly. If given a test name, find recent scheduled runs of `pr-test.yml` on `main` that failed:
```bash
# List recent scheduled runs targeting main (the primary source of truth for regressions)
# These are cron-triggered runs visible at:
# https://github.com/sgl-project/sglang/actions/workflows/pr-test.yml?query=event%3Aschedule
gh run list --repo sgl-project/sglang --workflow="pr-test.yml" --event schedule --branch main --limit 20 --json databaseId,conclusion,createdAt,headSha
# Find the job containing the test
gh run view {RUN_ID} --repo sgl-project/sglang --json jobs --jq '.jobs[] | select(.conclusion == "failure") | {name, conclusion, databaseId}'
# Get the failure details
gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -E -B 5 -A 30 "AssertionError|FAIL|Error|{TEST_NAME}"
```
2. **Record the failure signature:**
- Exact error message and assertion
- Affected test method name
- Model/config involved
- Numeric values (e.g., tolerance diffs, scores)
- Whether the failure is deterministic (same values across runs)
### Phase 2: Temporal Bisection
3. **Find the boundary between passing and failing runs.** Walk through the scheduled run history (from the `pr-test.yml` schedule runs on `main`) to identify:
- Last known PASSING run (sha + date)
- First known FAILING run (sha + date)
```bash
# For each scheduled run, check the specific partition/job status
gh run view {RUN_ID} --repo sgl-project/sglang --json jobs --jq '.jobs[] | select(.name == "{JOB_NAME}") | {conclusion, databaseId}'
# Verify a specific test passed or failed in a run
gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -E "{TEST_NAME}|PASSED|FAILED|logprobs mismatch" | head -10
```
4. **List commits between the boundary:**
```bash
git log --oneline {LAST_PASS_SHA}..{FIRST_FAIL_SHA}
```
5. **Filter for relevant commits** that touch files related to the failing test (model layers, kernels, test utilities, etc.):
```bash
git log --oneline {LAST_PASS_SHA}..{FIRST_FAIL_SHA} -- {relevant_paths}
```
### Phase 3: Runner/Hardware Analysis
6. **Check if the failure is runner-specific.** Extract the runner identity from each failing and passing run:
```bash
# Get runner name and machine
gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -E "Runner name|Machine name" | head -5
# Get GPU/driver info
gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -i -E "NVIDIA-SMI|Driver Version|CUDA Version" | head -5
# Get package versions
gh run view {RUN_ID} --repo sgl-project/sglang --job {JOB_ID} --log 2>&1 | grep -E "sgl.kernel.*==|flashinfer.*==" | head -5
```
7. **Correlate runners with pass/fail outcomes.** Build a table:
| Run ID | Date | Runner | GPU Type | Driver | Result |
|--------|------|--------|----------|--------|--------|
If all failures map to a specific runner type/GPU and all passes map to another, the issue is **hardware-specific**, not a code regression.
### Phase 4: Code Analysis
8. **If a code regression is suspected** (failures not runner-specific), examine the candidate commits:
- Read the changed files
- Understand how the changes could affect the failing test
- Look for prefill-vs-decode differences, TP-specific paths, kernel changes
9. **If a hardware issue is suspected**, analyze:
- Kernel compatibility (CUDA compute capability)
- Driver version differences
- All-reduce / NCCL behavior differences
- CUDA graph capture differences across GPU architectures
### Phase 5: Remote Reproduction (Optional)
Only if SSH target and docker container were provided.
10. **Verify the remote environment:**
```bash
ssh {SSH_TARGET} "docker exec {CONTAINER} nvidia-smi --query-gpu=name,driver_version --format=csv"
ssh {SSH_TARGET} "docker exec {CONTAINER} pip show sgl-kernel sglang flashinfer-python 2>&1 | grep -E 'Name:|Version:'"
```
11. **Ensure latest code is installed.** If the container is stale, update:
```bash
# Try fetching latest main
ssh {SSH_TARGET} "docker exec {CONTAINER} bash -c 'cd /path/to/sglang && git fetch origin main && git checkout origin/main'"
# Or download and install from tarball if git auth fails
ssh {SSH_TARGET} "docker exec {CONTAINER} bash -c 'cd /tmp && curl -L https://github.com/sgl-project/sglang/archive/refs/heads/main.tar.gz | tar xz && cd sglang-main && pip install -e \"python[all]\"'"
# Reinstall (after git fetch)
ssh {SSH_TARGET} "docker exec {CONTAINER} bash -c 'cd /path/to/sglang && pip install -e \"python[all]\"'"
# Install test dependencies if needed
ssh {SSH_TARGET} "docker exec {CONTAINER} pip install peft rouge-score"
```
12. **Create a minimal reproduction script** that:
- Uses `if __name__ == '__main__'` with `mp.set_start_method("spawn")`
- Runs the specific failing test configuration
- Prints key metrics (diffs, scores, outputs)
- Exits with code 1 on failure
13. **Copy and run the reproduction script:**
```bash
scp /tmp/repro_script.py {SSH_TARGET}:/tmp/
ssh {SSH_TARGET} "docker cp /tmp/repro_script.py {CONTAINER}:/tmp/"
ssh {SSH_TARGET} "docker exec -e CUDA_VISIBLE_DEVICES=0,1 {CONTAINER} python3 /tmp/repro_script.py"
```
14. **Run control experiments** to isolate the variable:
- If suspecting TP issue: run with TP=1 as control
- If suspecting GPU issue: compare same code on different GPU
- If suspecting a specific commit: test before/after that commit
### Phase 6: Report
15. **Produce a structured report:**
```markdown
## CI Regression Bisection Report
### Failure Signature
- **Test**: {test_file}::{test_method}
- **Error**: {exact error message}
- **Key metrics**: {numeric values}
- **Deterministic**: Yes/No
### Root Cause Classification
One of:
- **Code Regression**: PR #{number} introduced the bug
- **Hardware-Specific**: Fails on {GPU_TYPE}, passes on others
- **Environment Change**: New runner/driver/package version
- **Pre-existing Flakiness**: Intermittent, not a new regression
### Evidence
| Condition | Result |
|-----------|--------|
| {condition1} | PASS/FAIL |
| {condition2} | PASS/FAIL |
### Timeline
- {date}: Last known pass ({sha}, {runner})
- {date}: First known fail ({sha}, {runner})
- {date}: Confirmed reproduction on {server}
### Recommended Fix
- **Short-term**: {workaround}
- **Long-term**: {proper fix}
```
## Key Patterns to Recognize
| Pattern | Diagnosis |
|---------|-----------|
| Same SHA passes on runner A, fails on runner B | Hardware/runner-specific |
| All runners fail after commit X | Code regression from commit X |
| Intermittent - same runner sometimes passes/fails | Flaky test or race condition |
| Prefill OK but decode fails | TP/all-reduce issue in decode path |
| Works with TP=1, fails with TP>1 | Tensor parallelism bug |
| Exact same numeric diff every time | Deterministic bug, not flakiness |
## Important Notes
- **Always check runner identity** before concluding it's a code regression. Many "consistent" failures are actually runner-specific.
- **Test partition assignments change over time** as tests are added/removed. A test may move between partitions, landing on different runner types.
- **H200 runners** use `/root/actions-runner/` path and machine names like `gpu-h200-worker-*`. Non-H200 runners use `/public_sglang_ci/runner-*` paths.
- When running remote reproduction, use `run_in_background` for long-running tests and check output with `TaskOutput`.
- Container environments may be stale - always verify package versions match CI before drawing conclusions.
================================================
FILE: .claude/skills/write-sglang-test/SKILL.md
================================================
---
name: write-sglang-test
description: Guide for writing SGLang CI/UT tests following project conventions. Covers CustomTestCase, CI registration, server fixtures, model selection, and test placement. Use when creating new tests, adding CI test cases, writing unit tests, or when the user asks to add tests for SGLang features.
---
# Writing SGLang CI / UT Tests
## Core Rules
1. **Always use `CustomTestCase`** — never raw `unittest.TestCase`
2. **Place tests in `test/registered/<category>/`** — only use `test/manual/` for debugging / non-CI tests
3. **Reuse server fixtures** — inherit from `DefaultServerBase` or write `setUpClass`/`tearDownClass` with `popen_launch_server`
4. **Smallest model for model-agnostic functionality** — use `DEFAULT_SMALL_MODEL_NAME_FOR_TEST` (Llama-3.2-1B-Instruct) for basic features that don't depend on model size
5. **8B for general performance** — use `DEFAULT_MODEL_NAME_FOR_TEST` (Llama-3.1-8B-Instruct, single-node) for performance tests that don't involve spec / DP / parallelism
6. **Bigger features → discuss case by case** — spec, DP attention, tensor/pipeline parallelism etc. may need multi-GPU suites and specific models
---
## Test File Template
### Functional correctness test (small model)
```python
import unittest
import requests
from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.test_utils import (
DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
register_cuda_ci(est_time=60, suite="stage-b-test-small-1-gpu")
class TestMyFeature(CustomTestCase):
    """Functional check against one small-model server shared by every test in the class."""

    @classmethod
    def setUpClass(cls):
        """Launch the server once; test methods reach it through ``cls.base_url``."""
        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        feature_args = ["--arg1", "value1"]  # feature-specific args
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=feature_args,
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the process tree of the server started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_basic_functionality(self):
        """A short generation request must come back with HTTP 200."""
        endpoint = self.base_url + "/generate"
        payload = {"text": "Hello", "sampling_params": {"max_new_tokens": 32}}
        response = requests.post(endpoint, json=payload)
        self.assertEqual(response.status_code, 200)
if __name__ == "__main__":
unittest.main(verbosity=3)
```
### General performance test (8B model, single node, no spec/DP/parallelism)
```python
import time
import unittest
import requests
from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu")
class TestMyFeaturePerf(CustomTestCase):
    """Latency check against one 8B-model server shared by every test in the class."""

    @classmethod
    def setUpClass(cls):
        """Launch the server with default arguments before any test runs."""
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the process tree of the server started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def test_latency(self):
        """One 128-token generation must succeed and finish in under 5 seconds."""
        t0 = time.perf_counter()
        # Request construction stays inside the timed window, as in the template's original form.
        endpoint = self.base_url + "/generate"
        payload = {"text": "Hello", "sampling_params": {"max_new_tokens": 128}}
        response = requests.post(endpoint, json=payload)
        elapsed = time.perf_counter() - t0
        self.assertEqual(response.status_code, 200)
        self.assertLess(elapsed, 5.0, "Latency exceeded threshold")
if __name__ == "__main__":
unittest.main(verbosity=3)
```
---
## Server Fixture Reuse
For tests that only need a standard server, inherit from `DefaultServerBase` and override class attributes:
```python
from sglang.test.server_fixtures.default_fixture import DefaultServerBase
from sglang.test.test_utils import DEFAULT_SMALL_MODEL_NAME_FOR_TEST


class TestMyFeature(DefaultServerBase):
    """Reuses the standard server fixture; only class attributes are overridden."""

    # Model the fixture should launch.
    model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
    # Extra command-line arguments for the server launch.
    other_args = ["--enable-my-feature"]

    def test_something(self):
        ...
```
Available fixtures in `python/sglang/test/server_fixtures/`:
| Fixture | Use case |
|---------|----------|
| `DefaultServerBase` | Standard single-server tests |
| `EagleServerBase` | EAGLE speculative decoding |
| `PDDisaggregationServerBase` | Disaggregated prefill/decode |
| `MMMUServerBase` | Multimodal VLM tests |
---
## CI Registration
Every test file in `test/registered/` **must** call a registration function at module level:
```python
from sglang.test.ci.ci_register import register_cuda_ci, register_amd_ci
register_cuda_ci(est_time=60, suite="stage-b-test-small-1-gpu")
register_amd_ci(est_time=60, suite="stage-b-test-small-1-gpu-amd") # optional
```
Parameters:
- `est_time`: estimated runtime in seconds (used for CI partitioning)
- `suite`: which CI suite to run in (see below)
- `nightly=True`: for nightly-only tests (default `False` = per-commit)
- `disabled="reason"`: temporarily disable with explanation
### Suite selection guide
**Default cases (1 GPU):**
| Scenario | Model | Suite |
|----------|-------|-------|
| Model-agnostic basic functionality | 1B (smallest) | `stage-b-test-small-1-gpu` |
| General performance (no spec/DP/parallelism) | 8B | `stage-b-test-large-1-gpu` |
**Bigger features (case by case):**
| Scenario | Suite |
|----------|-------|
| 2 GPU (e.g. TP=2) | `stage-b-test-large-2-gpu` |
| 4 GPU (H100) | `stage-c-test-4-gpu-h100` |
| 8 GPU (H200) | `stage-c-test-8-gpu-h200` |
| Nightly, 1 GPU | `nightly-1-gpu` |
| Nightly, 8 GPU | `nightly-8-gpu` |
For spec, DP attention, parallelism, disaggregation, etc., discuss with the team to determine the appropriate suite and GPU configuration.
---
## Model Constants
All defined in `python/sglang/test/test_utils.py`:
| Constant | Model | When to use |
|----------|-------|-------------|
| `DEFAULT_SMALL_MODEL_NAME_FOR_TEST` | Llama-3.2-1B-Instruct | Model-agnostic basic functionality |
| `DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE` | Llama-3.2-1B | Base (non-instruct) model tests |
| `DEFAULT_MODEL_NAME_FOR_TEST` | Llama-3.1-8B-Instruct | General performance (single node) |
| `DEFAULT_MOE_MODEL_NAME_FOR_TEST` | Mixtral-8x7B-Instruct | MoE-specific tests |
| `DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST` | — | Embedding tests |
| `DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST` | — | Vision-language tests |
---
## Test Placement
```
test/
├── registered/ # CI tests (auto-discovered by run_suite.py)
│ ├── sampling/ # test_penalty.py, test_sampling_params.py ...
│ ├── sessions/ # test_session_control.py ...
│ ├── openai_server/ # basic/, features/, validation/ ...
│ ├── spec/ # eagle/, utils/ ...
│ ├── models/ # model-specific accuracy tests
│ ├── perf/ # performance benchmarks
│ └── <category>/ # create new category if needed
├── manual/ # Non-CI: debugging, one-off, manual verification
└── run_suite.py # CI runner (scans registered/ only)
```
**Decision rule**: if the test should run in CI → `registered/`. If it's for local debugging or requires special hardware not in CI → `manual/`.
---
## Key Utilities
```python
from sglang.test.test_utils import (
CustomTestCase, # base class with retry logic
popen_launch_server, # launch server subprocess
DEFAULT_URL_FOR_TEST, # auto-configured base URL
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, # 600s default
run_bench_serving, # benchmark helper (launch + bench)
)
from sglang.srt.utils import kill_process_tree # cleanup server
```
---
## Checklist
Before submitting a test:
- [ ] Inherits from `CustomTestCase` (not `unittest.TestCase`)
- [ ] Has `register_*_ci(...)` call at module level
- [ ] Placed in `test/registered/<category>/`
- [ ] Model selection: smallest for model-agnostic features, 8B for general perf, case-by-case for other complex features
- [ ] `setUpClass` launches server, `tearDownClass` kills it
- [ ] Has `if __name__ == "__main__": unittest.main(verbosity=3)`
- [ ] `est_time` is reasonable (measure locally)
================================================
FILE: .codespellrc
================================================
[codespell]
ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS
skip = *.json,*.jsonl,*.patch,*.txt
================================================
FILE: .coveragerc
================================================
[run]
source = python/sglang/srt
omit =
*/test/*
*/__pycache__/*
[report]
show_missing = true
exclude_lines =
pragma: no cover
if __name__ == .__main__.:
raise NotImplementedError
if TYPE_CHECKING
[html]
directory = htmlcov
================================================
FILE: .devcontainer/Dockerfile
================================================
# Dev-container image: starts from the sglang dev image and adds a non-root
# user (devuser) with passwordless sudo, a copied shell setup, uv and Rust.
FROM lmsysorg/sglang:dev
# Create non-root user with specified UID and GID
# NOTE: Replace with your own UID and GID. This is a workaround from https://github.com/microsoft/vscode-remote-release/issues/49#issuecomment-489060908.
# The defaults below can be overridden at build time with --build-arg.
# Presumably these should match the host user's IDs; see the linked issue.
ARG HOST_UID=1003
ARG HOST_GID=1003
# Group and user are both named devuser; -m creates the home directory and
# -s sets zsh as the login shell.
RUN groupadd -g $HOST_GID devuser && \
useradd -m -u $HOST_UID -g $HOST_GID -s /bin/zsh devuser
# Give devuser sudo access
# Installs sudo, writes a sudoers.d drop-in granting passwordless sudo, and
# removes the apt package lists in the same layer.
RUN apt-get update && apt-get install -y sudo && \
echo "devuser ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/devuser && \
rm -rf /var/lib/apt/lists/* && \
apt-get clean
# Set up oh-my-zsh for devuser
# Copies root's oh-my-zsh, zsh, vim and tmux configuration into devuser's home
# (these files must already exist under /root in the base image), rewrites the
# oh-my-zsh path inside the copied .zshrc, then hands ownership to devuser.
RUN cp -r /root/.oh-my-zsh /home/devuser/.oh-my-zsh && \
cp /root/.zshrc /home/devuser/.zshrc && \
cp /root/.vimrc /home/devuser/.vimrc && \
cp /root/.tmux.conf /home/devuser/.tmux.conf && \
sed -i 's|/root/.oh-my-zsh|/home/devuser/.oh-my-zsh|g' /home/devuser/.zshrc && \
chown -R devuser:devuser /home/devuser/
# Set workspace directory and ownership
# The chown covers the whole /sgl-workspace tree, not only the sglang checkout.
WORKDIR /sgl-workspace/sglang
RUN chown -R devuser:devuser /sgl-workspace
# Switch to devuser
# Every instruction after this point runs as devuser instead of root.
USER devuser
# Install uv
# Runs the upstream installer script as devuser.
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# Install rust
# Runs the rustup installer as devuser; -y is passed through to the installer script.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
================================================
FILE: .devcontainer/devcontainer.json
================================================
{
"name": "sglang",
"build": {
"dockerfile": "Dockerfile"
},
"remoteUser": "devuser",
"customizations": {
"vscode": {
"extensions": [
// Python development
"ms-python.python",
"charliermarsh.ruff",
// Rust development
"rust-lang.rust-analyzer",
"tamasfe.even-better-toml"
]
}
},
"forwardPorts": [],
"runArgs": [
"--gpus",
"all"
],
// The two lines below ensure that your local changes in the sglang
// repo are automatically synced to the sglang pip package installed
// in the dev docker container. You can remove / comment out these
// two lines if you prefer to sync code changes manually.
"workspaceMount": "source=${localWorkspaceFolder},target=/sgl-workspace/sglang,type=bind",
"workspaceFolder": "/sgl-workspace/sglang"
}
================================================
FILE: .github/CI_PERMISSIONS.json
================================================
{
"1pikachu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"Alcanderian": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"AniZpZ": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"BBuf": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"BHZ-BER": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"ByronHsu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"CaoE": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"CatherineSue": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"Chen-0210": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"ClawSeven": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"ConnorLi96": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"DarkSharpness": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"Edwardf0t1": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"FlamingoPg": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"FrankLeeeee": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"Fridge003": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"HaiShaw": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"HanHan009527": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"HandH1998": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"Hanrui-Wang": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"HydraQYH": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"JeremieMelo": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"Johnsonms": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"JustinTong0323": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"Kangyan-Zhou": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"LorrinWWW": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"MingxuZh": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"Oasis-Git": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"Prozac614": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"Qiaolin-Yu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"Qihang-Zhang": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"Ratish1": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"RubiaCx": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"ShangmingCai": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"Shunkangz": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"SimonCqk": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"TianQiLin666666": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"Ubospica": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"Valentine233": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"Xia-Weiwen": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"XiaotongJiang": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"XucSh": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"YAMY1234": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"Ying1123": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"ZailiWang": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"ZhengWG": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"ZhengdQin": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"acelyc111": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"adarshxs": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"airMeng": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"alisonshao": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"alphabetc1": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"amysaq2023": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"attack204": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"ayrnb": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"azhurkevich": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"b8zhong": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"blzheng": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"byjiang1996": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"cctry": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"ch-wan": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"chunyuan-w": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"cicirori": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"cyb70289": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"dongjiyingdjy": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"dougyster": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"elfiegg": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"fy1214": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"fzyzcjy": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"gaopengff": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"gongwei-130": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"gongy": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"guapisolo": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"guoyuhong": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"hanming-lu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"harrisonlimh": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"harvenstar": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"hebiao064": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"hlu1": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"hnyls2002": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"huaiyuzh": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"huangtingwei9988": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"hubertlu-tw": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"hyhieu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"hzh0425": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"iforgetmyname": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"ishandhanani": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"ispobock": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"jason-fxz": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"jasperjiaguo": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"jhinpan": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"jianan-gu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"jinleic": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"jinmingyi1998": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"kaixih": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"kevin85421": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"key4ng": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"kkHuang-amd": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"kpham-sgl": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"kssteven418": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"kushanam": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"lanking520": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"lifuhuang": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"liusy58": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"liz-badada": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"merrymercy": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"mickqian": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"mingfeima": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"minleminzui": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"mmangkad": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"narutolhy": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"netanel-haber": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"nvcastet": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"ocss884": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"pansicheng": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"pavanimajety": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"pdasgup": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"ping1jing2": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"pranavm-nvidia": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"pyc96": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"qingquansong": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"qywu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"rainj-me": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"ravi03071991": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"rkooo567": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"saienduri": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"samuellees": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"scottjlee": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"sglang-bot": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"sglang-npu-bot": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"shaharmor98": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"shanyu-sys": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"shuaills": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"sleepcoo": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"slin1237": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"stmatengss": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"strgrb": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"sufeng-buaa": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"sundar24295s": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"sunjiweiswift": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"sunxxuns": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"thecodingwizard": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"timmy-feng": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"trevor-m": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"vincentzed": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"wenscarl": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"whybeyoung": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"wisclmy0611": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"xiezhq-hermann": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"xutizhou": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"xyjixyjixyji": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"yanbing-j": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"yangsijia-serena": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"yeahdongcn": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"yhyang201": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"yilian49": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"yinghai": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"yingluosanqian": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"yizhang2077": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"ykcombat": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"ynwang007": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"yuan-luo": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"yundai424": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"yushengsu-thu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"yyihuang": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"yzh119": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"zhaochenyang20": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
},
"zhijian-liu": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"zhuzilin": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"zhyncs": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"zminglei": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"zyksir": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
},
"zyzshishui": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "custom override",
"can_rerun_stage": true
}
}
================================================
FILE: .github/CODEOWNERS
================================================
.github @merrymercy @Fridge003 @ispobock @Kangyan-Zhou @bingxche
/docker @Fridge003 @ispobock @HaiShaw @ishandhanani @yctseng0211
/docker/npu.Dockerfile @ping1jing2 @iforgetmyname
/python/pyproject.toml @merrymercy @Fridge003 @ispobock
/python/sglang/jit_kernel @DarkSharpness @BBuf @celve @HydraQYH @yuan-luo
/python/sglang/jit_kernel/diffusion @yingluosanqian @BBuf @mickqian
/python/sglang/multimodal_gen @mickqian @yhyang201 @ping1jing2
/python/sglang/multimodal_gen/runtime/cache @DefTruth
/python/sglang/multimodal_gen/runtime/layers @mickqian @yhyang201 @BBuf @yingluosanqian @ping1jing2
/python/sglang/multimodal_gen/runtime/models/dits @mickqian @yhyang201 @BBuf @yingluosanqian @ping1jing2
/python/sglang/srt/batch_invariant_ops @Fridge003 @hebiao064
/python/sglang/srt/constrained @hnyls2002 @DarkSharpness
/python/sglang/srt/compilation @hebiao064
/python/sglang/srt/disaggregation @ByronHsu @hnyls2002 @ShangmingCai
/python/sglang/srt/disaggregation/ascend @ping1jing2 @iforgetmyname
/python/sglang/srt/distributed @yizhang2077 @merrymercy @ch-wan
/python/sglang/srt/distributed/device_communicators/mooncake_transfer_engine.py @ShangmingCai @stmatengss
/python/sglang/srt/dllm @ClawSeven @btw616
/python/sglang/srt/entrypoints @ispobock @CatherineSue @slin1237 @merrymercy @JustinTong0323
/python/sglang/srt/entrypoints/grpc_server.py @CatherineSue @slin1237
/python/sglang/srt/eplb @fzyzcjy @ch-wan
/python/sglang/srt/function_call @CatherineSue @JustinTong0323
/python/sglang/srt/grpc @CatherineSue @slin1237
/python/sglang/srt/hardware_backend/npu @ping1jing2 @iforgetmyname
/python/sglang/srt/hardware_backend/npu/quantization @OrangeRedeng @TamirBaydasov @iforgetmyname
/python/sglang/srt/layers @merrymercy @Ying1123 @Fridge003 @ispobock @HaiShaw @ch-wan @BBuf @Edwardf0t1
/python/sglang/srt/layers/attention @merrymercy @Fridge003 @ispobock @Qiaolin-Yu @hebiao064 @HaiShaw
/python/sglang/srt/layers/attention/fla @yizhang2077 @hebiao064
/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py @yizhang2077 @hebiao064 @hanming-lu
/python/sglang/srt/layers/attention/mamba @yizhang2077 @hebiao064
/python/sglang/srt/layers/attention/nsa @1am9trash @hubertlu-tw @kkHuang-amd @HaiShaw @Fridge003 @hlu1 @rainj-me
/python/sglang/srt/layers/attention/vision.py @mickqian @yuan-luo @yhyang201
/python/sglang/srt/layers/quantization @ch-wan @BBuf @Edwardf0t1 @FlamingoPg @AniZpZ @HaiShaw @b8zhong
/python/sglang/srt/layers/quantization/quark @kkHuang-amd @yichiche @hubertlu-tw @1am9trash @BowenBao
/python/sglang/srt/lora @Ying1123 @Fridge003 @lifuhuang @yushengsu-thu
/python/sglang/srt/managers @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
/python/sglang/srt/managers/scheduler_pp_mixin.py @ShangmingCai @XucSh
/python/sglang/srt/mem_cache @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann @hanming-lu @yizhang2077 @hzh0425 @ispobock
/python/sglang/srt/model_executor @merrymercy @Ying1123 @hnyls2002 @Fridge003 @ispobock
/python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py @hebiao064
/python/sglang/srt/models/deepseek_common @Fridge003 @ispobock @fzyzcjy @ch-wan
/python/sglang/srt/models/deepseek_v2.py @fzyzcjy @zhyncs @ispobock @ch-wan @merrymercy @Fridge003
/python/sglang/srt/multimodal @mickqian @JustinTong0323 @yhyang201 @yuan-luo
/python/sglang/srt/observability @merrymercy @fzyzcjy @sufeng-buaa
/python/sglang/srt/ray @Qiaolin-Yu @xyuzh
/python/sglang/srt/speculative @Ying1123 @merrymercy @hnyls2002
/sgl-kernel @ispobock @BBuf @yizhang2077 @merrymercy @FlamingoPg @HaiShaw
/sgl-model-gateway @slin1237 @CatherineSue
/sgl-model-gateway/benches @slin1237
/sgl-model-gateway/bindings/python @CatherineSue @key4ng @slin1237
/sgl-model-gateway/e2e_test @CatherineSue @key4ng
/sgl-model-gateway/src/config @slin1237
/sgl-model-gateway/src/core @slin1237
/sgl-model-gateway/src/data_connector @key4ng
/sgl-model-gateway/src/grpc_client @CatherineSue @slin1237
/sgl-model-gateway/src/mcp @key4ng @slin1237
/sgl-model-gateway/src/policies @slin1237 @ByronHsu
/sgl-model-gateway/src/proto @CatherineSue @slin1237
/sgl-model-gateway/src/protocols @CatherineSue @key4ng
/sgl-model-gateway/src/reasoning_parser @CatherineSue
/sgl-model-gateway/src/routers @CatherineSue @key4ng @slin1237
/sgl-model-gateway/src/tokenizer @slin1237 @CatherineSue
/sgl-model-gateway/src/tool_parser @slin1237 @CatherineSue
/sgl-model-gateway/src/wasm @slin1237
/sgl-model-gateway/examples/wasm @slin1237
/test/srt/ascend @ping1jing2 @iforgetmyname
/test/srt/test_modelopt* @Edwardf0t1
================================================
FILE: .github/FOLDER_README.md
================================================
# Maintenance Tools
This folder contains tools and workflows for automating maintenance tasks.
## CI Permissions
`CI_PERMISSIONS.json` defines the CI permissions granted to each user.
Maintainers can directly edit the file to add entries with `"reason": "custom override"`.
Maintainers can also run `update_ci_permission.py` to update it with some auto rules (e.g., top contributors in the last 90 days get full permissions).
## Others
- `MAINTAINER.md` defines the code maintenance model.
================================================
FILE: .github/ISSUE_TEMPLATE/1-bug-report.yml
================================================
name: 🐞 Bug report
description: Report a bug to help us reproduce and fix it.
title: "[Bug] "
labels: ['Bug']
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: I searched related issues but found no solution.
- label: The bug persists in the latest version.
- label: Issues without environment info and a minimal reproducible demo are hard to resolve and may receive no feedback.
- label: If this is not a bug report but a general question, please start a discussion at https://github.com/sgl-project/sglang/discussions. Otherwise, it will be closed.
- label: Please use English. Otherwise, it will be closed.
- type: textarea
attributes:
label: Describe the bug
description: A clear, concise description of the bug.
validations:
required: true
- type: textarea
attributes:
label: Reproduction
description: Command/script run and model used.
placeholder: Paste the command here.
validations:
required: true
- type: textarea
attributes:
label: Environment
description: Run `python3 -m sglang.check_env` and paste output here. Issues without this will be closed.
placeholder: Paste environment output here.
validations:
required: true
================================================
FILE: .github/ISSUE_TEMPLATE/2-feature-request.yml
================================================
name: 🚀 Feature request
description: Suggest an idea for this project
title: "[Feature] "
body:
- type: checkboxes
attributes:
label: Checklist
options:
- label: If this is not a feature request but a general question, please start a discussion at https://github.com/sgl-project/sglang/discussions. Otherwise, it will be closed.
- label: Please use English. Otherwise, it will be closed.
- type: textarea
attributes:
label: Motivation
description: |
Clearly and concisely describe the feature's motivation.
validations:
required: true
- type: textarea
attributes:
label: Related resources
description: |
Provide official releases or third-party implementations if available.
================================================
FILE: .github/MAINTAINER.md
================================================
# SGLang Code Maintenance Model
This document describes the code maintenance model for the SGLang project.
Since SGLang is a large project involving multiple organizations and hardware platforms, we designed this model with the following goals:
- Ensure a responsive and smooth review process.
- Allow for fast iteration, so maintainers can sometimes bypass flaky CI tests for important PRs.
## Role Descriptions
There are four roles in this maintenance model. Some are custom roles, while others are predefined by GitHub.
- **Merge Oncall**: The person who drives the PR merge process. They have strong area-specific expertise and uphold a high bar for code quality.
- Permission: Merge PRs. Bypass branch protection rules if needed.
- Responsibility: Shepherd the merge of PRs assigned to their area. Revert or hotfix any issues related to their merge (especially if they bypass).
- **Codeowner**: The person who protects critical code. Without a bypass, each PR needs at least one Codeowner approval for each modified file protected by [CODEOWNERS](./CODEOWNERS). Please note that this role is not an honor but a significant responsibility because PRs cannot be merged without your approval (except when bypassed by a Merge Oncall).
- Permission: Approve PRs, allowing them to be merged without a bypass.
- Responsibility: Review PRs in a timely manner.
- **Write**: A person with write permission to the SGLang repo.
- Permission: Merge PRs if they have passed required tests and been approved by Codeowners. This role cannot bypass branch protection rules.
- Responsibility: Review and merge PRs in a timely manner.
- **CI Oncall**: A person who manages CI runners for specific hardware platforms.
- Permission: Add CI runners.
- Responsibility: Keep the CI runners up and running.
__Note__: Difference between Merge Oncall and Codeowner
- The Merge Oncall is an active role held by someone who actively tries to help merge PRs and can bypass CI if needed.
- The Codeowner is a passive protection role provided by GitHub; it prevents accidental changes to critical code.
- The list of Merge Oncalls is attached below. The list of Codeowners is in the [CODEOWNERS](./CODEOWNERS) file.
__Note__: The permissions to trigger CI tests are defined separately according to these [rules](https://docs.sglang.io/developer_guide/contribution_guide.html#how-to-trigger-ci-tests).
## Pull Request Merge Process
1. The author submits a pull request (PR) and fills out the PR checklist.
2. A bot assigns this PR to a Merge Oncall and @-mentions them. At the same time, GitHub will automatically request reviews from Codeowners.
3. Someone tags the PR with a `run-ci` label ([help](https://docs.sglang.io/developer_guide/contribution_guide.html#how-to-trigger-ci-tests)). Then the author can trigger CI by pushing new commits.
4. The Merge Oncall coordinates the review (e.g., asking people to review) and approves the PR; the Codeowners also approve the PR. If the assigned Merge Oncall is not responsive, the author can ping other related Merge Oncalls and Reviewers in the list below.
5. The code can now be merged:
- **Ideal case:** For each modified file, one Codeowner has approved the PR. The PR has also passed the required CI tests. Then, anyone with write permission can merge the PR.
- **Exception:** In cases where it is difficult to meet all requirements (due to flaky CI or slow responses), a Merge Oncall can bypass branch protection to merge the PR.
If you run into any issues during the merge, you can discuss them in the [Slack channels](https://slack.sglang.io/): #dev, #pull-request, and #ci-cd-build-release.
## The List of Merge Oncalls and Reviewers
The format is @github-username (Slack username).
TODO: fill in the list.
We currently have many Merge Oncalls, mainly because the CI is flaky and the CODEOWNERS file is too coarse-grained.
In the future, we hope the CI can be improved so that bypasses are rarely needed. After that, most Merge Oncalls can be converted back to the Write and Codeowner roles.
This list is based on the current situation. If you or someone you know would like to take on more responsibility and are qualified, please ping @Lianmin Zheng and @Ying Sheng in the Slack channel. They will start a nomination and internal review process.
## The List of CI Oncalls
The format is @github-username (Slack username).
### NVIDIA GPUs
@merrymercy (Lianmin Zheng), @Kangyan-Zhou (Kangyan Zhou), @ch-wan (Cheng Wan), @HanHan009527 (hanhan), @ishandhanani (Ishan Dhanani), @key4ng (Keyang Ru), @slin1237 (Simo Lin), @ShangmingCai (Shangming Cai)
### AMD GPUs
@saienduri (Sai Enduri), @HaiShaw (Henry HAI)
### Intel CPU and XPU
@mingfeima (Mingfei Ma), @DiweiSun (Diwei Sun)
### Ascend NPUs
@iforgetmyname (Even Zhou)
This list is based on the current situation. If you or someone you know would like to donate machines for CI, they can serve as the CI oncalls for their machines. Please ping @Lianmin Zheng and @Ying Sheng in the Slack channel. They will start a nomination and internal review process.
================================================
FILE: .github/actions/upload-cuda-coredumps/action.yml
================================================
name: Upload CUDA Coredumps
description: Upload CUDA coredump files as artifacts and clean up the directory.
inputs:
artifact-suffix:
description: Suffix appended to the artifact name (e.g. matrix partition id)
required: false
default: ""
retention-days:
description: Number of days to retain the artifact
required: false
default: "7"
runs:
using: composite
steps:
- name: Upload CUDA coredumps
uses: actions/upload-artifact@v4
with:
name: cuda-coredumps-${{ github.job }}${{ inputs.artifact-suffix && format('-{0}', inputs.artifact-suffix) }}
path: ${{ env.SGLANG_CUDA_COREDUMP_DIR || '/tmp/sglang_cuda_coredumps' }}/
retention-days: ${{ inputs.retention-days }}
if-no-files-found: ignore
- name: Cleanup CUDA coredumps
shell: bash
run: rm -rf "${{ env.SGLANG_CUDA_COREDUMP_DIR || '/tmp/sglang_cuda_coredumps' }}"
================================================
FILE: .github/actions/wait-for-jobs/action.yml
================================================
name: Wait for Jobs
description: Poll and wait for specified jobs in the current workflow run to complete
inputs:
stage-name:
description: 'Human-readable stage name for log messages (e.g. "stage-a")'
required: true
jobs:
description: |
JSON array of job specs to wait for. Each element is either:
- a string: exact job name (e.g. "stage-a-test-1")
- an object { "prefix": "...", "expected_count": N }: for matrix jobs
required: true
max-wait-minutes:
description: 'Maximum time to wait before timing out'
required: false
default: '240'
poll-interval-seconds:
description: 'Seconds between polling attempts'
required: false
default: '120'
github-token:
description: 'GitHub token for API calls'
required: false
default: ${{ github.token }}
outputs:
result:
description: 'Overall result: success, failure, or timeout'
value: ${{ steps.wait.outputs.result }}
runs:
using: composite
steps:
- name: Wait for jobs to complete
id: wait
uses: actions/github-script@v7
env:
INPUT_STAGE_NAME: ${{ inputs.stage-name }}
INPUT_JOBS: ${{ inputs.jobs }}
INPUT_MAX_WAIT_MINUTES: ${{ inputs.max-wait-minutes }}
INPUT_POLL_INTERVAL_SECONDS: ${{ inputs.poll-interval-seconds }}
with:
github-token: ${{ inputs.github-token }}
script: |
const stageName = process.env.INPUT_STAGE_NAME;
const jobSpecs = JSON.parse(process.env.INPUT_JOBS);
const maxWaitMinutes = parseInt(process.env.INPUT_MAX_WAIT_MINUTES);
const pollIntervalSeconds = parseInt(process.env.INPUT_POLL_INTERVAL_SECONDS);
const maxAttempts = (maxWaitMinutes * 60) / pollIntervalSeconds;
// Normalize job specs into a uniform format
const normalizedSpecs = jobSpecs.map(spec => {
if (typeof spec === 'string') {
return { prefix: spec, expected_count: 1, exact: true };
}
return { ...spec, exact: false };
});
const totalExpectedJobs = normalizedSpecs.reduce((sum, s) => sum + s.expected_count, 0);
// Match job name: exact match or prefix + " (" for matrix jobs
const matchesSpec = (jobName, spec) => {
if (spec.exact) {
return jobName === spec.prefix;
}
return jobName === spec.prefix || jobName.startsWith(spec.prefix + ' (');
};
for (let attempt = 0; attempt < maxAttempts; attempt++) {
const jobs = await github.paginate(github.rest.actions.listJobsForWorkflowRun, {
owner: context.repo.owner,
repo: context.repo.repo,
run_id: context.runId,
per_page: 100,
});
let allCompleted = true;
let failedJobs = [];
let completedCount = 0;
let totalCount = 0;
for (const spec of normalizedSpecs) {
const matchingJobs = jobs.filter(job => matchesSpec(job.name, spec));
for (const job of matchingJobs) {
totalCount++;
console.log(`${job.name}: status=${job.status}, conclusion=${job.conclusion}`);
if (job.status === 'completed') {
completedCount++;
if (job.conclusion !== 'success' && job.conclusion !== 'skipped') {
failedJobs.push(job.name);
}
} else {
allCompleted = false;
}
}
if (matchingJobs.length < spec.expected_count) {
console.log(`${spec.prefix}: found ${matchingJobs.length}/${spec.expected_count} jobs (waiting for more)`);
allCompleted = false;
}
}
console.log(`[${stageName}] Progress: ${completedCount}/${totalCount} jobs completed (expected ${totalExpectedJobs})`);
// Fail fast if any jobs failed
if (failedJobs.length > 0) {
core.setOutput('result', 'failure');
core.setFailed(`${stageName} jobs failed: ${failedJobs.join(', ')}`);
return;
}
if (allCompleted && totalCount >= totalExpectedJobs) {
core.setOutput('result', 'success');
return;
}
console.log(`Waiting ${pollIntervalSeconds}s... (attempt ${attempt + 1}/${maxAttempts})`);
await new Promise(resolve => setTimeout(resolve, pollIntervalSeconds * 1000));
}
core.setFailed(`Timeout waiting for ${stageName} jobs`);
core.setOutput('result', 'timeout');
================================================
FILE: .github/labeler.yml
================================================
# Configuration for the GitHub Labeler action
# Automatically adds labels to PRs based on the files changed
# Router specific (Rust code in sgl-model-gateway)
model-gateway:
- changed-files:
- any-glob-to-any-file: 'sgl-model-gateway/**/*'
# Kernel specific
sgl-kernel:
- changed-files:
- any-glob-to-any-file: 'sgl-kernel/**/*'
# JIT kernel specific
jit-kernel:
- changed-files:
- any-glob-to-any-file: 'python/sglang/jit_kernel/**/*'
# Documentation
documentation:
- changed-files:
- any-glob-to-any-file:
- '**/*.md'
- 'docs/**/*'
- 'README*'
# Dependencies
dependencies:
- changed-files:
- any-glob-to-any-file:
- '**/requirements*.txt'
- '**/Cargo.toml'
- '**/Cargo.lock'
- '**/pyproject*.toml'
- '**/setup.py'
- '**/poetry.lock'
- '**/package.json'
- '**/package-lock.json'
# Multi-modal
Multi-modal:
- changed-files:
- any-glob-to-any-file:
- '**/*multimodal*'
- '**/*vision*'
- '**/*vlm*'
# Diffusion
diffusion:
- changed-files:
- any-glob-to-any-file: 'python/sglang/multimodal_gen/**/*'
# LoRA
lora:
- changed-files:
- any-glob-to-any-file:
- '**/*lora*'
# Quantization
quant:
- changed-files:
- any-glob-to-any-file:
- '**/*quant*'
- '**/*quantization*'
# Speculative decoding
speculative-decoding:
- changed-files:
- any-glob-to-any-file:
- '**/*speculative*'
# AMD specific
amd:
- changed-files:
- any-glob-to-any-file:
- '**/*amd*'
- '**/*rocm*'
# NPU specific
npu:
- changed-files:
- any-glob-to-any-file:
- '**/*npu*'
- '**/*ascend*'
# Blackwell
blackwell:
- changed-files:
- any-glob-to-any-file:
- '**/*nvfp4*'
- 'sgl-kernel/csrc/attention/cutlass_sm100_mla/**/*'
- 'python/sglang/srt/layers/attention/trtllm_mla_backend.py'
- 'python/sglang/srt/layers/attention/trtllm_mha_backend.py'
# DeepSeek specific
deepseek:
- changed-files:
- any-glob-to-any-file:
- '**/*deepseek*'
# HiCache
hicache:
- changed-files:
- any-glob-to-any-file:
- '**/*hicache*'
# Deterministic
deterministic:
- changed-files:
- any-glob-to-any-file: 'python/sglang/srt/batch_invariant_ops/**/*'
# Piecewise CUDA Graph
piecewise-cuda-graph:
- changed-files:
- any-glob-to-any-file: 'python/sglang/srt/compilation/**/*'
# Moore Threads specific
mthreads:
- changed-files:
- any-glob-to-any-file:
- '**/*mthreads*'
- '**/*musa*'
================================================
FILE: .github/pull_request_template.md
================================================
<!-- Thank you for your contribution! Please follow these guidelines to enhance your pull request. If anything is unclear, submit your PR and reach out to maintainers for assistance. Join our Slack community at https://slack.sglang.io to discuss further. -->
## Motivation
<!-- Describe the purpose and goals of this pull request. -->
## Modifications
<!-- Detail the changes made in this pull request. -->
## Accuracy Tests
<!-- If this pull request affects model outputs (e.g., changes to the kernel or model forward code), provide accuracy test results. -->
## Benchmarking and Profiling
<!-- If this pull request impacts inference speed, provide benchmarking and profiling results. -->
## Checklist
- [ ] Format your code according to the [Format code with pre-commit](https://docs.sglang.io/developer_guide/contribution_guide.html#format-code-with-pre-commit).
- [ ] Add unit tests according to the [Run and add unit tests](https://docs.sglang.io/developer_guide/contribution_guide.html#run-and-add-unit-tests).
- [ ] Update documentation according to [Write documentations](https://docs.sglang.io/developer_guide/contribution_guide.html#write-documentations).
- [ ] Provide accuracy and speed benchmark results according to [Test the accuracy](https://docs.sglang.io/developer_guide/contribution_guide.html#test-the-accuracy) and [Benchmark the speed](https://docs.sglang.io/developer_guide/contribution_guide.html#benchmark-the-speed).
- [ ] Follow the SGLang code style [guidance](https://docs.sglang.io/developer_guide/contribution_guide.html#code-style-guidance).
## Review Process
1. Ping Merge Oncalls to start the PR flow. See the [PR Merge Process](https://github.com/sgl-project/sglang/blob/main/.github/MAINTAINER.md#pull-request-merge-process).
2. Get approvals from [CODEOWNERS](https://github.com/sgl-project/sglang/blob/main/.github/CODEOWNERS) and other reviewers.
3. Trigger CI tests with [comments](https://docs.sglang.io/developer_guide/contribution_guide.html#how-to-trigger-ci-tests) or contact authorized users to do so.
- `/tag-run-ci-label`, `/rerun-failed-ci`, `/tag-and-rerun-ci`
4. After green CI and required approvals, ask Merge Oncalls to merge.
================================================
FILE: .github/update_ci_permission.py
================================================
"""
Update the CI permissions configuration file.
This script updates the `CI_PERMISSIONS.json` file, which defines the CI permissions granted to each user.
The format of `CI_PERMISSIONS.json` is as follows:
{
"username1": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 0,
"reason": "top contributor",
"can_rerun_stage": true
},
"username2": {
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override",
"can_rerun_stage": true
}
}
Permissions are assigned according to the following rules:
1. Add the top 50 contributors from the last 90 days with full permissions, no cooldown, and the reason "top contributor".
2. Load all users from the existing `CI_PERMISSIONS.json` file and update their entries as follows:
- If a user is already covered by rule 1, skip that user.
- If the old reason of a user is "top contributor" but they are not in the current top contributors list, change their configuration to:
{
"can_tag_run_ci_label": true,
"can_rerun_failed_ci": true,
"cooldown_interval_minutes": 60,
"reason": "custom override"
}
- For all other cases, preserve the original configuration unchanged.
3. All other users receive no permissions and a 120-minute cooldown (they are omitted from the file).
Usage:
export GH_TOKEN="your_github_token"
python3 update_ci_permission.py
# Sort-only mode (no network calls, no GH_TOKEN required)
python3 update_ci_permission.py --sort-only
"""
import argparse
import json
import os
from collections import Counter
from datetime import datetime, timedelta, timezone
try:
import requests
except ImportError:
requests = None # Only needed for non-sort-only runs
# Configuration
# Owner and name of the repository queried through the GitHub REST API.
REPO_OWNER = "sgl-project"
REPO_NAME = "sglang"
# Path of the permissions file, resolved relative to this script's directory.
FILE_NAME = os.path.join(os.path.dirname(__file__), "CI_PERMISSIONS.json")
# HTTP headers sent with every GitHub API request. Left empty at import time and
# populated by main() after GH_TOKEN is read; github_api_get() refuses to run
# while this is still empty.
HEADERS = {}
def github_api_get(endpoint, params=None, timeout=30):
    """Fetch a repository-scoped GitHub REST endpoint, following pagination.

    Args:
        endpoint: Path below ``/repos/{REPO_OWNER}/{REPO_NAME}/`` (e.g. ``"commits"``).
        params: Query parameters for the first request only. Follow-up pages use
            the URL from the ``Link`` response header, which already embeds them.
        timeout: Seconds to wait on each HTTP request before ``requests`` raises,
            so a stalled connection cannot hang the script indefinitely.

    Returns:
        A list with the items of every page fetched. If a page answers with a
        non-200 status, the error is printed and the items collected so far are
        returned (possibly an empty list). When the endpoint returns a non-list
        JSON payload, that payload is returned as-is.

    Raises:
        RuntimeError: If ``requests`` is not installed or ``HEADERS`` is empty.
    """
    if requests is None:
        raise RuntimeError(
            "The requests package is required. Install it or use --sort-only."
        )
    if not HEADERS:
        raise RuntimeError(
            "GitHub headers not initialized. Set GH_TOKEN or use --sort-only."
        )

    results = []
    url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/{endpoint}"
    while url:
        response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Error fetching {url}: {response.status_code} {response.text}")
            # Best effort: hand back what was collected instead of crashing.
            break

        data = response.json()
        if not isinstance(data, list):
            return data  # Non-list response (not paginated usually)
        results.extend(data)

        # requests parses the Link header for us; no "next" relation ends the loop.
        url = response.links.get("next", {}).get("url")
        params = None  # Params are included in the next link
    return results
def get_write_access_users():
    """Return the logins of collaborators holding admin, maintain, or push rights.

    Returns:
        A set of GitHub login names.
    """
    print("Fetching collaborators with write access...")
    # Note: This endpoint usually requires admin rights on the token.
    members = github_api_get("collaborators", params={"per_page": 100})

    # Any one of these permission flags counts as write access.
    elevated_levels = ("admin", "maintain", "push")
    writers = {
        member["login"]
        for member in members
        if any(member.get("permissions", {}).get(level) for level in elevated_levels)
    }

    print(f"Found {len(writers)} users with write access.")
    return writers
def get_top_contributors(days=90, limit=50):
    """Return the logins of the most active committers over a recent window.

    Args:
        days: Size of the look-back window, in days, counted back from now (UTC).
        limit: Maximum number of contributors to return.

    Returns:
        A set with up to ``limit`` GitHub logins, ranked by commit count.
    """
    print(f"Fetching commits from the last {days} days...")
    window_start = datetime.now(timezone.utc) - timedelta(days=days)

    # Fetch commits
    commits = github_api_get(
        "commits", params={"since": window_start.isoformat(), "per_page": 100}
    )

    # commit['author'] contains the GitHub user object (can be None if not linked)
    linked_logins = (
        entry["author"]["login"]
        for entry in commits
        if entry.get("author") and "login" in entry["author"]
    )
    author_counts = Counter(linked_logins)

    top_users = {login for login, _count in author_counts.most_common(limit)}
    print(f"Found {len(top_users)} active contributors in the last {days} days.")
    return top_users
def load_existing_permissions():
    """Read the current permissions file.

    Returns:
        The parsed JSON content of ``FILE_NAME``, or an empty dict when the file
        does not exist or cannot be decoded as JSON (a warning is printed in the
        latter case).
    """
    if not os.path.exists(FILE_NAME):
        return {}

    try:
        with open(FILE_NAME, "r") as handle:
            permissions = json.load(handle)
    except json.JSONDecodeError:
        print(f"Warning: {FILE_NAME} is invalid JSON. Starting fresh.")
        return {}
    return permissions
def sort_permissions_file():
    """Rewrite the CI permissions file with its users in alphabetical order.

    The file is parsed here directly so that a corrupt file is detected and left
    untouched; a sort-only run must never replace unreadable content with an
    empty object. Nothing is written when the file is missing, is not valid
    JSON, or does not hold a JSON object.
    """
    if not os.path.exists(FILE_NAME):
        print(f"{FILE_NAME} not found. Nothing to sort.")
        return

    try:
        with open(FILE_NAME, "r") as f:
            old_permissions = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error: {FILE_NAME} is invalid JSON ({e}). Leaving it unchanged.")
        return

    if not isinstance(old_permissions, dict):
        print(f"Error: {FILE_NAME} does not contain a JSON object. Leaving it unchanged.")
        return

    sorted_permissions = dict(sorted(old_permissions.items()))
    with open(FILE_NAME, "w") as f:
        json.dump(sorted_permissions, f, indent=4)
        f.write("\n")  # Keep the trailing newline
    print(f"Sorted {FILE_NAME}. Total users: {len(sorted_permissions)}")
def main():
    """Entry point: sort the permissions file, or rebuild it from GitHub data.

    With ``--sort-only`` the existing file is sorted and nothing else happens.
    Otherwise the top contributors are fetched and merged with the existing
    entries according to the rules in the module docstring, and the result is
    written back sorted by username.

    Raises:
        ValueError: If ``GH_TOKEN`` is not set (non-sort-only mode).
        RuntimeError: If no contributors could be fetched. Writing the file in
            that state would demote every current "top contributor".
    """
    parser = argparse.ArgumentParser(description="Update or sort CI permissions.")
    parser.add_argument(
        "--sort-only",
        action="store_true",
        help="Only sort CI_PERMISSIONS.json alphabetically without fetching data.",
    )
    args = parser.parse_args()

    if args.sort_only:
        sort_permissions_file()
        return

    gh_token = os.getenv("GH_TOKEN")
    if not gh_token:
        raise ValueError("Error: GH_TOKEN environment variable is not set.")

    global HEADERS
    HEADERS = {
        "Authorization": f"Bearer {gh_token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }

    # Gather Data
    # NOTE(review): the collaborator list is fetched and reported but is not
    # consulted by any of the rules below.
    try:
        write_access_users = get_write_access_users()
    except Exception as e:
        print(f"Warning: Could not fetch collaborators (check token scope). Error: {e}")
        write_access_users = set()

    top_contributors = get_top_contributors(days=90, limit=50)
    if not top_contributors:
        # github_api_get() returns an empty list when the request fails. Going on
        # would rewrite the file with every "top contributor" demoted.
        raise RuntimeError(
            "No contributors were fetched from GitHub; refusing to rewrite "
            f"{FILE_NAME}. Check the API error above and the GH_TOKEN scope."
        )

    old_permissions = load_existing_permissions()
    new_permissions = {}

    # Rule 1: Add Top 50 Contributors
    for user in top_contributors:
        new_permissions[user] = {
            "can_tag_run_ci_label": True,
            "can_rerun_failed_ci": True,
            "cooldown_interval_minutes": 0,
            "reason": "top contributor",
        }

    # Rule 2: Process Existing Users (Merge Logic)
    for user, config in old_permissions.items():
        if user in new_permissions:
            # Already handled by Rule 1
            continue

        old_reason = config.get("reason", "")
        if old_reason == "top contributor":
            # They fell off the top contributor list: keep access, add a cooldown.
            new_permissions[user] = {
                "can_tag_run_ci_label": True,
                "can_rerun_failed_ci": True,
                "cooldown_interval_minutes": 60,
                "reason": "custom override",
            }
        else:
            # Preserve custom overrides
            new_permissions[user] = config

    # Save and Sort
    # Sorting keys for cleaner diffs
    sorted_permissions = dict(sorted(new_permissions.items()))
    with open(FILE_NAME, "w") as f:
        json.dump(sorted_permissions, f, indent=4)
        f.write("\n")  # Add trailing newline
    print(f"Successfully updated {FILE_NAME}. Total users: {len(sorted_permissions)}")
# Run the update only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
================================================
FILE: .github/workflows/amd-aiter-scout.yml
================================================
# Resolves an AITER git ref to a commit SHA and runs the selected AMD nightly and
# PR test workflows against it, then fails if any of them failed or was cancelled.
name: AMD AITER Scout

on:
  schedule:
    - cron: '0 20 * * 1'   # Monday 20:00 UTC
    - cron: '0 20 * * 4'   # Thursday 20:00 UTC
  workflow_dispatch:
    inputs:
      aiter_ref:
        description: 'AITER git ref (branch, tag, or SHA). Default: main (latest commit)'
        required: false
        type: string
        default: 'main'
      job_filter:
        description: 'Comma-separated workflows to run: nightly-amd, nightly-amd-rocm720, pr-test-amd, pr-test-amd-rocm720. Default: all'
        required: false
        type: string
        default: 'all'
      continue_on_error:
        description: 'Continue running other workflows even if one fails'
        required: false
        type: boolean
        default: true

concurrency:
  group: amd-aiter-scout-${{ github.run_id }}
  cancel-in-progress: true

jobs:
  resolve-aiter:
    runs-on: ubuntu-latest
    outputs:
      aiter_sha: ${{ steps.resolve.outputs.sha }}
      run_nightly_amd: ${{ steps.parse.outputs.run_nightly_amd }}
      run_nightly_amd_rocm720: ${{ steps.parse.outputs.run_nightly_amd_rocm720 }}
      run_pr_test_amd: ${{ steps.parse.outputs.run_pr_test_amd }}
      run_pr_test_amd_rocm720: ${{ steps.parse.outputs.run_pr_test_amd_rocm720 }}
    steps:
      - name: Resolve AITER commit
        id: resolve
        env:
          # Passed through the environment instead of being interpolated into the
          # script, so quotes or shell metacharacters in the input stay inert.
          REF: ${{ inputs.aiter_ref || 'main' }}
        run: |
          echo "Resolving AITER ref: ${REF}"
          SHA=$(git ls-remote https://github.com/ROCm/aiter.git "refs/heads/${REF}" | head -1 | cut -f1)
          if [ -z "$SHA" ]; then
            SHA=$(git ls-remote https://github.com/ROCm/aiter.git "refs/tags/${REF}" | head -1 | cut -f1)
          fi
          if [ -z "$SHA" ]; then
            SHA=$(git ls-remote https://github.com/ROCm/aiter.git "${REF}" | head -1 | cut -f1)
          fi
          if [ -z "$SHA" ]; then
            SHA="${REF}"
          fi
          echo "sha=${SHA}" >> $GITHUB_OUTPUT
          echo "### AITER Ref Resolution" >> $GITHUB_STEP_SUMMARY
          echo "- **Requested ref:** \`${REF}\`" >> $GITHUB_STEP_SUMMARY
          echo "- **Resolved SHA:** \`${SHA}\`" >> $GITHUB_STEP_SUMMARY
          echo "- **AITER commit:** https://github.com/ROCm/aiter/commit/${SHA}" >> $GITHUB_STEP_SUMMARY

      - name: Parse job filter
        id: parse
        env:
          FILTER: ${{ inputs.job_filter || 'all' }}
        run: |
          echo "Job filter: ${FILTER}"
          if [[ "$FILTER" == "all" ]]; then
            echo "run_nightly_amd=true" >> $GITHUB_OUTPUT
            echo "run_nightly_amd_rocm720=true" >> $GITHUB_OUTPUT
            echo "run_pr_test_amd=true" >> $GITHUB_OUTPUT
            echo "run_pr_test_amd_rocm720=true" >> $GITHUB_OUTPUT
          else
            # Wrap with commas for exact substring matching (avoids "nightly-amd" matching "nightly-amd-rocm720")
            PADDED=",${FILTER// /},"
            echo "run_nightly_amd=$(echo "$PADDED" | grep -q ',nightly-amd,' && echo true || echo false)" >> $GITHUB_OUTPUT
            echo "run_nightly_amd_rocm720=$(echo "$PADDED" | grep -q ',nightly-amd-rocm720,' && echo true || echo false)" >> $GITHUB_OUTPUT
            echo "run_pr_test_amd=$(echo "$PADDED" | grep -q ',pr-test-amd,' && echo true || echo false)" >> $GITHUB_OUTPUT
            echo "run_pr_test_amd_rocm720=$(echo "$PADDED" | grep -q ',pr-test-amd-rocm720,' && echo true || echo false)" >> $GITHUB_OUTPUT
          fi
          echo "### Job Filter" >> $GITHUB_STEP_SUMMARY
          echo "- **Filter:** \`${FILTER}\`" >> $GITHUB_STEP_SUMMARY

  # In the four calls below, continue_on_error defaults to true for scheduled
  # runs and otherwise follows the dispatch input. The input is a boolean, and a
  # comparison such as `inputs.continue_on_error == ''` is also true for `false`
  # (both sides coerce to 0), so the event name is checked instead.
  call-nightly-amd:
    if: needs.resolve-aiter.outputs.run_nightly_amd == 'true'
    needs: resolve-aiter
    uses: ./.github/workflows/nightly-test-amd.yml
    secrets: inherit
    with:
      ref: ${{ github.sha }}
      aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }}
      job_filter: 'all'
      continue_on_error: ${{ github.event_name != 'workflow_dispatch' || inputs.continue_on_error }}

  call-nightly-amd-rocm720:
    if: needs.resolve-aiter.outputs.run_nightly_amd_rocm720 == 'true'
    needs: resolve-aiter
    uses: ./.github/workflows/nightly-test-amd-rocm720.yml
    secrets: inherit
    with:
      ref: ${{ github.sha }}
      aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }}
      job_filter: 'all'
      continue_on_error: ${{ github.event_name != 'workflow_dispatch' || inputs.continue_on_error }}

  call-pr-test-amd:
    if: needs.resolve-aiter.outputs.run_pr_test_amd == 'true'
    needs: resolve-aiter
    uses: ./.github/workflows/pr-test-amd.yml
    secrets: inherit
    with:
      run_all_tests: true
      aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }}
      continue_on_error: ${{ github.event_name != 'workflow_dispatch' || inputs.continue_on_error }}

  call-pr-test-amd-rocm720:
    if: needs.resolve-aiter.outputs.run_pr_test_amd_rocm720 == 'true'
    needs: resolve-aiter
    uses: ./.github/workflows/pr-test-amd-rocm720.yml
    secrets: inherit
    with:
      run_all_tests: true
      aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }}
      continue_on_error: ${{ github.event_name != 'workflow_dispatch' || inputs.continue_on_error }}

  check-all-jobs:
    if: always()
    needs:
      - resolve-aiter
      - call-nightly-amd
      - call-nightly-amd-rocm720
      - call-pr-test-amd
      - call-pr-test-amd-rocm720
    runs-on: ubuntu-latest
    steps:
      - name: Summary
        env:
          # May equal the raw user-supplied ref when resolution found nothing,
          # so it is kept out of the script text as well.
          AITER_SHA: ${{ needs.resolve-aiter.outputs.aiter_sha }}
        run: |
          echo "## AMD AITER Scout Results" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "- **AITER SHA:** \`${AITER_SHA}\`" >> $GITHUB_STEP_SUMMARY
          echo "- **AITER commit:** https://github.com/ROCm/aiter/commit/${AITER_SHA}" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "| Workflow | Result |" >> $GITHUB_STEP_SUMMARY
          echo "|----------|--------|" >> $GITHUB_STEP_SUMMARY
          echo "| Nightly AMD (AITER Latest) | \`${{ needs.call-nightly-amd.result }}\` |" >> $GITHUB_STEP_SUMMARY
          echo "| Nightly AMD ROCm 7.2 | \`${{ needs.call-nightly-amd-rocm720.result }}\` |" >> $GITHUB_STEP_SUMMARY
          echo "| PR Test AMD (AITER Latest) | \`${{ needs.call-pr-test-amd.result }}\` |" >> $GITHUB_STEP_SUMMARY
          echo "| PR Test AMD ROCm 7.2 | \`${{ needs.call-pr-test-amd-rocm720.result }}\` |" >> $GITHUB_STEP_SUMMARY

      - name: Check if any job failed
        run: |
          if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then
            echo "One or more workflows failed"
            exit 1
          fi
          if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
            echo "One or more workflows were cancelled"
            exit 1
          fi
          echo "All workflows passed"
================================================
FILE: .github/workflows/amd-ci-job-monitor.yml
================================================
# Reports recent job status for the AMD CI workflows. With a job_filter input it
# produces a single report; otherwise it builds one report per job found in
# pr-test-amd.yml and nightly-test-amd.yml.
name: AMD CI Job Monitor

on:
  schedule:
    - cron: '0 0 * * *'   # Daily at midnight UTC
  pull_request:
    paths:
      - '.github/workflows/amd-ci-job-monitor.yml'
      - 'scripts/ci/utils/query_job_status.py'
  workflow_dispatch:
    inputs:
      hours:
        description: 'Time window in hours'
        required: false
        default: '24'
        type: string
      job_filter:
        description: 'Job name filter (leave empty for all AMD jobs)'
        required: false
        type: string

jobs:
  # Single job filter mode
  custom-report:
    name: Custom Job Report
    if: ${{ inputs.job_filter }}
    runs-on: ubuntu-latest
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install tabulate

      - name: Generate Custom Job Report
        timeout-minutes: 30
        env:
          # Free-form inputs reach the shell as variables rather than being
          # spliced into the script text.
          JOB_FILTER: ${{ inputs.job_filter }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo ${{ github.repository }} \
            --job "$JOB_FILTER" \
            --workflow "pr-test-amd.yml" \
            --hours "$HOURS" \
            --summary

  # Parse workflow files to get job names dynamically
  parse-workflows:
    name: Parse Workflow Jobs
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    outputs:
      pr_jobs: ${{ steps.parse.outputs.pr_jobs }}
      nightly_jobs: ${{ steps.parse.outputs.nightly_jobs }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Parse workflow files
        id: parse
        run: |
          # Parse pr-test-amd.yml and extract job names (exclude utility jobs)
          # Excluded: call-gate, check-changes, pr-test-amd-finish, cancel, check-all-jobs
          pr_jobs=$(yq -r '.jobs | keys | .[]' .github/workflows/pr-test-amd.yml | \
            grep -v -E '^(call-gate|check-changes|pr-test-amd-finish|cancel|check-all-jobs)$' | \
            jq -R -s -c 'split("\n") | map(select(length > 0))')
          echo "pr_jobs=$pr_jobs" >> $GITHUB_OUTPUT
          echo "PR jobs: $pr_jobs"
          # Parse nightly-test-amd.yml and extract job names (exclude utility jobs)
          # Excluded: check-all-jobs
          nightly_jobs=$(yq -r '.jobs | keys | .[]' .github/workflows/nightly-test-amd.yml | \
            grep -v -E '^(check-all-jobs)$' | \
            jq -R -s -c 'split("\n") | map(select(length > 0))')
          echo "nightly_jobs=$nightly_jobs" >> $GITHUB_OUTPUT
          echo "Nightly jobs: $nightly_jobs"

  # PR CI reports using dynamic matrix
  pr-ci-reports:
    name: PR - ${{ matrix.job_name }}
    needs: parse-workflows
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        job_name: ${{ fromJson(needs.parse-workflows.outputs.pr_jobs) }}
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install tabulate

      - name: Generate Report
        timeout-minutes: 15
        env:
          JOB_NAME: ${{ matrix.job_name }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo ${{ github.repository }} \
            --job "$JOB_NAME" \
            --workflow "pr-test-amd.yml" \
            --hours "$HOURS" \
            --summary

  # Nightly AMD test reports using dynamic matrix
  nightly-reports:
    name: Nightly - ${{ matrix.job_name }}
    needs: parse-workflows
    if: ${{ !inputs.job_filter }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        job_name: ${{ fromJson(needs.parse-workflows.outputs.nightly_jobs) }}
    env:
      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: pip install tabulate

      - name: Generate Nightly Report
        timeout-minutes: 15
        env:
          JOB_NAME: ${{ matrix.job_name }}
          HOURS: ${{ inputs.hours || '24' }}
        run: |
          python scripts/ci/utils/query_job_status.py \
            --repo ${{ github.repository }} \
            --job "$JOB_NAME" \
            --workflow "nightly-test-amd.yml" \
            --hours "$HOURS" \
            --summary
================================================
FILE: .github/workflows/auto-tune.yml
================================================
# Manually triggered workflow. Its single job currently only checks out the
# repository; no further steps are defined.
name: Auto tune

on:
  workflow_dispatch:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
================================================
FILE: .github/workflows/bot-bump-flashinfer-version.yml
================================================
# Manually triggered: creates a bot branch, runs the flashinfer version bump
# script with the requested version, then commits and opens a pull request.
name: Bot Bump Flashinfer Version

on:
  workflow_dispatch:
    inputs:
      new_version:
        description: 'New flashinfer version (e.g., 0.6.4)'
        required: true
        type: string

permissions:
  contents: write
  pull-requests: write

jobs:
  bump-flashinfer-version:
    runs-on: ubuntu-latest
    env:
      # The requested version reaches every script as a variable instead of
      # being interpolated into the shell text.
      NEW_VERSION: ${{ github.event.inputs.new_version }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install Python dependencies
        run: |
          pip install tomli

      - name: Configure Git and branch
        run: |
          git config user.name "sglang-bot"
          git config user.email "sglang-bot@users.noreply.github.com"
          RANDOM_SUFFIX=$(echo $RANDOM | md5sum | head -c 4)
          BRANCH_NAME="bot/bump-flashinfer-version-${NEW_VERSION}-${RANDOM_SUFFIX}"
          git checkout -b "$BRANCH_NAME"
          echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV

      - name: Run flashinfer version bump script
        run: |
          python scripts/release/bump_flashinfer_version.py "$NEW_VERSION"

      - name: Commit and create PR
        env:
          GH_TOKEN: ${{ secrets.GH_PAT_FOR_PULL_REQUEST }}
        run: |
          bash scripts/release/commit_and_pr.sh "flashinfer" "$NEW_VERSION" "$BRANCH_NAME"
================================================
FILE: .github/workflows/bot-bump-kernel-version-to-sglang.yml
================================================
# Manually triggered: checks whether a kernel version sync is needed and, if so,
# creates a bot branch, runs the bump script, opens a pull request, and runs the
# nightly / PR test workflows against the new branch.
name: Bot Bump Kernel Version to SGLang

on:
  workflow_dispatch:

permissions:
  contents: write
  pull-requests: write

jobs:
  bump-kernel-version-to-sglang:
    runs-on: ubuntu-latest
    # Consumed by the downstream test jobs to gate on the sync check and to
    # select the branch under test.
    outputs:
      branch_name: ${{ steps.set_output.outputs.branch_name }}
      needs_sync: ${{ steps.check_sync.outputs.needs_sync }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install Python dependencies
        run: |
          pip install tomli

      # The steps below read this step's needs_sync and kernel_version outputs.
      - name: Check if sync is needed
        id: check_sync
        run: |
          python scripts/release/check_kernel_version_to_sglang.py

      - name: Configure Git and branch
        if: steps.check_sync.outputs.needs_sync == 'true'
        id: set_output
        run: |
          git config user.name "sglang-bot"
          git config user.email "sglang-bot@users.noreply.github.com"
          RANDOM_SUFFIX=$(echo $RANDOM | md5sum | head -c 4)
          KERNEL_VERSION="${{ steps.check_sync.outputs.kernel_version }}"
          BRANCH_NAME="bot/bump-kernel-version-to-sglang-${KERNEL_VERSION}-${RANDOM_SUFFIX}"
          git checkout -b "$BRANCH_NAME"
          echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV
          echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
          echo "branch_name=$BRANCH_NAME" >> $GITHUB_OUTPUT

      - name: Run kernel version bump script
        if: steps.check_sync.outputs.needs_sync == 'true'
        run: |
          python scripts/release/bump_kernel_version_to_sglang.py

      - name: Commit and create PR
        if: steps.check_sync.outputs.needs_sync == 'true'
        env:
          GH_TOKEN: ${{ secrets.GH_PAT_FOR_PULL_REQUEST }}
        run: |
          bash scripts/release/commit_and_pr_kernel_to_sglang.sh "$KERNEL_VERSION" "$BRANCH_NAME"

  # Each job below calls a reusable test workflow on the bot branch, and only
  # when the sync check reported that a bump was needed.
  run-nightly-tests-nvidia:
    needs: bump-kernel-version-to-sglang
    if: needs.bump-kernel-version-to-sglang.outputs.needs_sync == 'true'
    uses: ./.github/workflows/nightly-test-nvidia.yml
    with:
      ref: ${{ needs.bump-kernel-version-to-sglang.outputs.branch_name }}
    secrets: inherit

  run-nightly-tests-amd:
    needs: bump-kernel-version-to-sglang
    if: needs.bump-kernel-version-to-sglang.outputs.needs_sync == 'true'
    uses: ./.github/workflows/nightly-test-amd.yml
    with:
      ref: ${{ needs.bump-kernel-version-to-sglang.outputs.branch_name }}
    secrets: inherit

  run-nightly-tests-npu:
    needs: bump-kernel-version-to-sglang
    if: needs.bump-kernel-version-to-sglang.outputs.needs_sync == 'true'
    uses: ./.github/workflows/nightly-test-npu.yml
    with:
      ref: ${{ needs.bump-kernel-version-to-sglang.outputs.branch_name }}
    secrets: inherit

  run-pr-tests-xeon:
    needs: bump-kernel-version-to-sglang
    if: needs.bump-kernel-version-to-sglang.outputs.needs_sync == 'true'
    uses: ./.github/workflows/pr-test-xeon.yml
    with:
      ref: ${{ needs.bump-kernel-version-to-sglang.outputs.branch_name }}
    secrets: inherit

  run-pr-tests-xpu:
    needs: bump-kernel-version-to-sglang
    if: needs.bump-kernel-version-to-sglang.outputs.needs_sync == 'true'
    uses: ./.github/workflows/pr-test-xpu.yml
    with:
      ref: ${{ needs.bump-kernel-version-to-sglang.outputs.branch_name }}
    secrets: inherit
================================================
FILE: .github/workflows/bot-bump-kernel-version.yml
================================================
# Manually triggered: creates a bot branch, runs the sgl-kernel version bump
# script with the requested version, then commits and opens a pull request.
name: Bot Bump Kernel Version

on:
  workflow_dispatch:
    inputs:
      new_version:
        description: 'New sgl-kernel version (e.g., 0.3.12)'
        required: true
        type: string

permissions:
  contents: write
  pull-requests: write

jobs:
  bump-kernel-version:
    runs-on: ubuntu-latest
    env:
      # The requested version reaches every script as a variable instead of
      # being interpolated into the shell text.
      NEW_VERSION: ${{ github.event.inputs.new_version }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install Python dependencies
        run: |
          pip install tomli

      - name: Configure Git and branch
        run: |
          git config user.name "sglang-bot"
          git config user.email "sglang-bot@users.noreply.github.com"
          RANDOM_SUFFIX=$(echo $RANDOM | md5sum | head -c 4)
          BRANCH_NAME="bot/bump-kernel-version-${NEW_VERSION}-${RANDOM_SUFFIX}"
          git checkout -b "$BRANCH_NAME"
          echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV

      - name: Run kernel version bump script
        run: |
          python scripts/release/bump_kernel_version.py "$NEW_VERSION"

      - name: Commit and create PR
        env:
          GH_TOKEN: ${{ secrets.GH_PAT_FOR_PULL_REQUEST }}
        run: |
          bash scripts/release/commit_and_pr.sh "sgl-kernel" "$NEW_VERSION" "$BRANCH_NAME"
================================================
FILE: .github/workflows/bot-bump-sglang-version.yml
================================================
# Manually triggered: creates a bot branch, runs the SGLang version bump script
# with the requested version, opens a pull request, and runs the nightly / PR
# test workflows against the new branch.
name: Bot Bump SGLang Version

on:
  workflow_dispatch:
    inputs:
      new_version:
        description: 'New SGLang version (e.g., 0.5.3 or 0.5.3rc0)'
        required: true
        type: string

permissions:
  contents: write
  pull-requests: write

jobs:
  bump-sglang-version:
    runs-on: ubuntu-latest
    outputs:
      branch_name: ${{ steps.set_output.outputs.branch_name }}
    env:
      # The requested version reaches every script as a variable instead of
      # being interpolated into the shell text.
      NEW_VERSION: ${{ github.event.inputs.new_version }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install Python dependencies
        run: |
          pip install tomli

      - name: Configure Git and branch
        id: set_output
        run: |
          git config user.name "sglang-bot"
          git config user.email "sglang-bot@users.noreply.github.com"
          RANDOM_SUFFIX=$(echo $RANDOM | md5sum | head -c 4)
          BRANCH_NAME="bot/bump-sglang-version-${NEW_VERSION}-${RANDOM_SUFFIX}"
          git checkout -b "$BRANCH_NAME"
          echo "BRANCH_NAME=$BRANCH_NAME" >> $GITHUB_ENV
          echo "branch_name=$BRANCH_NAME" >> $GITHUB_OUTPUT

      - name: Run SGLang version bump script
        run: |
          python scripts/release/bump_sglang_version.py "$NEW_VERSION"

      - name: Commit and create PR
        env:
          GH_TOKEN: ${{ secrets.GH_PAT_FOR_PULL_REQUEST }}
        run: |
          bash scripts/release/commit_and_pr.sh "SGLang" "$NEW_VERSION" "$BRANCH_NAME"

  # Each job below calls a reusable test workflow on the freshly created branch.
  run-nightly-tests-nvidia:
    needs: bump-sglang-version
    uses: ./.github/workflows/nightly-test-nvidia.yml
    with:
      ref: ${{ needs.bump-sglang-version.outputs.branch_name }}
    secrets: inherit

  run-nightly-tests-amd:
    needs: bump-sglang-version
    uses: ./.github/workflows/nightly-test-amd.yml
    with:
      ref: ${{ needs.bump-sglang-version.outputs.branch_name }}
    secrets: inherit

  run-nightly-tests-npu:
    needs: bump-sglang-version
    uses: ./.github/workflows/nightly-test-npu.yml
    with:
      ref: ${{ needs.bump-sglang-version.outputs.branch_name }}
    secrets: inherit

  run-pr-tests-xeon:
    needs: bump-sglang-version
    uses: ./.github/workflows/pr-test-xeon.yml
    with:
      ref: ${{ needs.bump-sglang-version.outputs.branch_name }}
    secrets: inherit

  run-pr-tests-xpu:
    needs: bump-sglang-version
    uses: ./.github/workflows/pr-test-xpu.yml
    with:
      ref: ${{ needs.bump-sglang-version.outputs.branch_name }}
    secrets: inherit
================================================
FILE: .github/workflows/bot-cherry-pick.yml
================================================
# Manually triggered: cherry-picks one commit onto a release branch, either by
# pushing directly or (default) by opening a pull request from a new branch.
name: Bot Cherry Pick to Release Branch

on:
  workflow_dispatch:
    inputs:
      commit_sha:
        description: 'Commit SHA to cherry-pick (full or short hash)'
        required: true
        type: string
      target_branch:
        description: 'Target release branch (e.g., release/v0.5.7)'
        required: true
        type: string
      create_pr:
        description: 'Create a PR instead of pushing directly'
        required: false
        type: boolean
        default: true

permissions:
  contents: write
  pull-requests: write

# Runs for the same target branch share a group; an in-progress run is not
# cancelled when another one starts.
concurrency:
  group: cherry-pick-${{ github.event.inputs.target_branch }}
  cancel-in-progress: false

jobs:
  cherry-pick:
    if: github.repository == 'sgl-project/sglang'
    runs-on: ubuntu-latest
    environment: 'prod'
    steps:
      # User inputs are handed to the scripts through env variables throughout
      # this job rather than being interpolated into the shell text.
      - name: Validate inputs
        env:
          TARGET_BRANCH: ${{ github.event.inputs.target_branch }}
        run: |
          if [[ ! "$TARGET_BRANCH" =~ ^release/v[0-9]+\.[0-9]+(\.[0-9]+)?$ ]]; then
            echo "::error::Target branch must match pattern 'release/vX.Y' or 'release/vX.Y.Z' (e.g., release/v0.5.7)"
            exit 1
          fi

      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GH_PAT_FOR_PULL_REQUEST }}

      - name: Configure Git
        run: |
          git config user.name "sglang-bot"
          git config user.email "sglang-bot@users.noreply.github.com"

      - name: Validate target branch exists
        env:
          TARGET_BRANCH: ${{ github.event.inputs.target_branch }}
        run: |
          git fetch origin
          if ! git ls-remote --exit-code --heads origin "$TARGET_BRANCH" > /dev/null 2>&1; then
            echo "::error::Target branch '$TARGET_BRANCH' does not exist on remote"
            exit 1
          fi

      # Outputs: full_sha, short_sha, commit_title.
      - name: Get commit info
        id: commit_info
        env:
          COMMIT_SHA_INPUT: ${{ github.event.inputs.commit_sha }}
        run: |
          # Verify commit exists
          if ! git cat-file -t "$COMMIT_SHA_INPUT" > /dev/null 2>&1; then
            echo "::error::Commit SHA '$COMMIT_SHA_INPUT' does not exist"
            exit 1
          fi
          # Get full SHA if short hash provided
          FULL_SHA=$(git rev-parse "$COMMIT_SHA_INPUT")
          COMMIT_TITLE=$(git log -1 --format="%s" "$FULL_SHA")
          SHORT_SHA=$(git rev-parse --short "$FULL_SHA")
          echo "full_sha=$FULL_SHA" >> $GITHUB_OUTPUT
          echo "short_sha=$SHORT_SHA" >> $GITHUB_OUTPUT
          # Use delimiter for multiline-safe output
          {
            echo "commit_title<<EOF"
            echo "$COMMIT_TITLE"
            echo "EOF"
          } >> $GITHUB_OUTPUT
          echo "Cherry-picking commit: $SHORT_SHA - $COMMIT_TITLE"

      # Outputs: cherry_pick_success, plus new_branch when a PR is requested.
      - name: Cherry-pick commit
        id: cherry_pick
        env:
          TARGET_BRANCH: ${{ github.event.inputs.target_branch }}
          FULL_SHA: ${{ steps.commit_info.outputs.full_sha }}
          SHORT_SHA: ${{ steps.commit_info.outputs.short_sha }}
          CREATE_PR: ${{ github.event.inputs.create_pr }}
        run: |
          if [[ "$CREATE_PR" == "true" ]]; then
            # Create a new branch for the PR
            RANDOM_SUFFIX=$(head -c 4 /dev/urandom | xxd -p)
            NEW_BRANCH="cherry-pick/${SHORT_SHA}-to-${TARGET_BRANCH#release/}-${RANDOM_SUFFIX}"
            git checkout -b "$NEW_BRANCH" "origin/$TARGET_BRANCH"
            echo "new_branch=$NEW_BRANCH" >> $GITHUB_OUTPUT
          else
            # Checkout target branch directly
            git checkout "$TARGET_BRANCH"
          fi
          # Attempt cherry-pick
          if git cherry-pick "$FULL_SHA"; then
            echo "cherry_pick_success=true" >> $GITHUB_OUTPUT
          else
            echo "::error::Cherry-pick failed due to conflicts. Please resolve manually."
            git cherry-pick --abort || true
            echo "cherry_pick_success=false" >> $GITHUB_OUTPUT
            exit 1
          fi

      - name: Push changes
        if: steps.cherry_pick.outputs.cherry_pick_success == 'true'
        env:
          CREATE_PR: ${{ github.event.inputs.create_pr }}
          TARGET_BRANCH: ${{ github.event.inputs.target_branch }}
          NEW_BRANCH: ${{ steps.cherry_pick.outputs.new_branch }}
        run: |
          if [[ "$CREATE_PR" == "true" ]]; then
            git push origin "$NEW_BRANCH"
          else
            git push origin "$TARGET_BRANCH"
          fi

      - name: Create Pull Request
        if: steps.cherry_pick.outputs.cherry_pick_success == 'true' && github.event.inputs.create_pr == 'true'
        env:
          GH_TOKEN: ${{ secrets.GH_PAT_FOR_PULL_REQUEST }}
          TARGET_BRANCH: ${{ github.event.inputs.target_branch }}
          SHORT_SHA: ${{ steps.commit_info.outputs.short_sha }}
          COMMIT_TITLE: ${{ steps.commit_info.outputs.commit_title }}
          FULL_SHA: ${{ steps.commit_info.outputs.full_sha }}
          NEW_BRANCH: ${{ steps.cherry_pick.outputs.new_branch }}
        run: |
          PR_TITLE="[Cherry-pick] ${COMMIT_TITLE} to ${TARGET_BRANCH}"
          gh pr create \
            --title "$PR_TITLE" \
            --base "$TARGET_BRANCH" \
            --head "$NEW_BRANCH" \
            --label "cherry-pick" \
            --body-file - <<EOF
          Cherry-pick of commit ${FULL_SHA} to \`${TARGET_BRANCH}\`
          **Original commit:** ${FULL_SHA}
          **Original title:** ${COMMIT_TITLE}
          ---
          *This PR was automatically created by the cherry-pick workflow.*
          EOF

      # Runs even when an earlier step failed, so the outcome is always recorded.
      - name: Summary
        if: always()
        env:
          FULL_SHA: ${{ steps.commit_info.outputs.full_sha }}
          COMMIT_TITLE: ${{ steps.commit_info.outputs.commit_title }}
          TARGET_BRANCH: ${{ github.event.inputs.target_branch }}
          CHERRY_PICK_SUCCESS: ${{ steps.cherry_pick.outputs.cherry_pick_success }}
          CREATE_PR: ${{ github.event.inputs.create_pr }}
          NEW_BRANCH: ${{ steps.cherry_pick.outputs.new_branch }}
          ACTOR: ${{ github.actor }}
        run: |
          echo "## Cherry-Pick Summary" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "- **Triggered by:** @${ACTOR}" >> $GITHUB_STEP_SUMMARY
          echo "- **Commit:** ${FULL_SHA}" >> $GITHUB_STEP_SUMMARY
          echo "- **Title:** ${COMMIT_TITLE}" >> $GITHUB_STEP_SUMMARY
          echo "- **Target Branch:** ${TARGET_BRANCH}" >> $GITHUB_STEP_SUMMARY
          if [[ "$CHERRY_PICK_SUCCESS" == "true" ]]; then
            echo "- **Status:** ✅ Success" >> $GITHUB_STEP_SUMMARY
          else
            echo "- **Status:** ❌ Failed" >> $GITHUB_STEP_SUMMARY
          fi
          if [[ "$CREATE_PR" == "true" && "$CHERRY_PICK_SUCCESS" == "true" ]]; then
            echo "- **PR Branch:** ${NEW_BRANCH}" >> $GITHUB_STEP_SUMMARY
          fi
================================================
FILE: .github/workflows/cancel-pr-workflow-on-merge.yml
================================================
# When a pull request is closed as merged, cancels workflow runs associated with
# that pull request number via styfle/cancel-workflow-action.
name: Cancel PR Workflows on Merge

on:
  pull_request_target:
    types:
      - closed

permissions:
  actions: write

jobs:
  cancel:
    # Closed-without-merge events are ignored.
    if: github.event.pull_request.merged == true
    runs-on: ubuntu-latest
    steps:
      - name: Cancel Previous Runs
        uses: styfle/cancel-workflow-action@0.12.1
        with:
          workflow_id: all
          access_token: ${{ secrets.GITHUB_TOKEN }}
          ignore_sha: true
          pr_number: ${{ github.event.pull_request.number }}
================================================
FILE: .github/workflows/cancel-unfinished-pr-tests.yml
================================================
# Manually triggered: for each listed workflow file, lists unfinished runs and
# cancels those triggered by pull_request events, unless the matching open PR
# carries the "high priority" label. Non-PR runs are only listed.
name: Cancel Unfinished PR Runs

on:
  workflow_dispatch:
    inputs:
      workflows:
        description: 'Space-separated list of workflow filenames to cancel'
        required: true
        type: string
        default: 'pr-test.yml'

permissions:
  actions: write  # Needed to cancel runs
  contents: read  # Needed to read repo info
  pull-requests: read  # needed for gh pr view (labels)

jobs:
  cancel-unfinished-pr-runs:
    runs-on: ubuntu-latest
    steps:
      - name: Install GitHub CLI
        run: sudo apt-get install -y gh jq

      - name: Cancel unfinished PR-associated runs (skip high-priority PRs)
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          WORKFLOWS: ${{ github.event.inputs.workflows || 'pr-test.yml' }}
        shell: bash
        run: |
          set -euo pipefail
          # Read the space-separated string from the input into a bash array
          read -r -a WORKFLOW_FILES <<< "${WORKFLOWS}"
          echo "Targeting ${#WORKFLOW_FILES[@]} workflow(s): ${WORKFLOWS}"
          echo ""
          for workflow_file in "${WORKFLOW_FILES[@]}"; do
            echo "========================================="
            echo "Workflow: $workflow_file"
            echo "========================================="
            # Get all unfinished runs
            all_runs=$(gh run list \
              --repo "$REPO" \
              --workflow "$workflow_file" \
              --json databaseId,status,event,url,createdAt \
              --limit 1000 \
              | jq -c '.[] | select(.status=="queued" or .status=="waiting" or .status=="in_progress")')
            if [ -z "$all_runs" ]; then
              echo "✅ No unfinished runs found"
              echo ""
              continue
            fi
            # Count runs by event type
            total_runs=$(echo "$all_runs" | wc -l)
            pr_runs=$(echo "$all_runs" | jq -s '[.[] | select(.event=="pull_request")] | length')
            other_runs=$(echo "$all_runs" | jq -s '[.[] | select(.event!="pull_request")] | length')
            echo "📊 Summary: $total_runs unfinished runs ($pr_runs PR-related, $other_runs other)"
            echo ""
            # Process non-PR runs first
            if [ "$other_runs" -gt 0 ]; then
              echo "--- Non-PR Runs ---"
              echo "$all_runs" | jq -c 'select(.event!="pull_request")' | while read -r run; do
                run_url=$(echo "$run" | jq -r '.url')
                run_event=$(echo "$run" | jq -r '.event')
                run_status=$(echo "$run" | jq -r '.status')
                echo " • $run_event ($run_status): $run_url"
              done
              echo ""
            fi
            # Process PR runs
            if [ "$pr_runs" -gt 0 ]; then
              echo "--- PR Runs (checking for cancellation) ---"
              echo "$all_runs" | jq -c 'select(.event=="pull_request")' | while read -r run; do
                run_id=$(echo "$run" | jq -r '.databaseId')
                run_url=$(echo "$run" | jq -r '.url')
                run_status=$(echo "$run" | jq -r '.status')
                echo ""
                echo "Run ($run_status): $run_url"
                # Fetch full run details to get head repository and branch info
                run_details=$(gh api -H "Accept: application/vnd.github+json" \
                  "repos/$REPO/actions/runs/$run_id" 2>/dev/null || true)
                if [ -z "$run_details" ]; then
                  echo " ⚠️ Could not fetch run details, skipping"
                  continue
                fi
                # Get head owner and branch (works for both fork and non-fork PRs)
                head_owner=$(echo "$run_details" | jq -r '.head_repository.owner.login // empty')
                head_branch=$(echo "$run_details" | jq -r '.head_branch // empty')
                if [ -z "$head_owner" ] || [ -z "$head_branch" ]; then
                  echo " ⚠️ Missing head info, skipping"
                  continue
                fi
                echo " Branch: ${head_owner}:${head_branch}"
                # Find PR by searching with head=owner:branch
                pr_number=$(gh api -H "Accept: application/vnd.github+json" \
                  "repos/$REPO/pulls?state=open&head=${head_owner}:${head_branch}" \
                  --jq '.[0].number // empty' 2>/dev/null || true)
                if [ -z "$pr_number" ]; then
                  echo " ⚠️ No open PR found, skipping"
                  continue
                fi
                pr_url="https://github.com/$REPO/pull/$pr_number"
                echo " PR: $pr_url"
                # Check for high priority label
                labels=$(gh pr view "$pr_number" --repo "$REPO" --json labels \
                  | jq -r '.labels[].name' 2>/dev/null || true)
                if echo "$labels" | grep -Fxq "high priority"; then
                  echo " 🛑 Skipping (high priority label)"
                  continue
                fi
                echo " 🚫 Cancelling..."
                gh run cancel "$run_id" --repo "$REPO" || echo " ⚠️ Cancellation failed"
              done
            fi
            echo ""
          done
          echo "========================================="
          echo "✅ Processing complete"
          echo "========================================="
================================================
FILE: .github/workflows/ci-coverage-overview.yml
================================================
name: CI Coverage Overview
on:
schedule:
- cron: '0 6 * * *' # Daily at 6 AM UTC
pull_request:
paths:
- '.github/workflows/ci-coverage-overview.yml'
- 'scripts/ci/utils/ci_coverage_report.py'
- 'test/registered/**'
workflow_dispatch:
inputs:
output_format:
description: 'Output format'
required: false
default: 'markdown'
type: choice
options:
- markdown
- json
jobs:
summary:
name: Summary
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Generate Summary Report
run: |
python scripts/ci/utils/ci_coverage_report.py --section summary
by-folder:
name: Tests by Folder
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Generate Tests by Folder Report
run: |
python scripts/ci/utils/ci_coverage_report.py --section by-folder
by-suite:
name: Tests by Suite
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Generate Tests by Suite Report
run: |
python scripts/ci/utils/ci_coverage_report.py --section by-suite
unit-test-coverage:
name: Unit Test Code Coverage
if: github.event_name != 'pull_request'
runs-on: 1-gpu-runner
timeout-minutes: 30
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
timeout-minutes: 10
run: |
pip install -e "python/[test]"
- name: Run unit tests with coverage
  timeout-minutes: 10
  run: |
    # When no `shell:` is given, Linux runners execute `run` with
    # `bash -e {0}` (no pipefail). Without the line below, the pipeline's exit
    # status would be tee's, and a failing pytest run would leave this step
    # green. The summary step that follows uses `if: always()`, so it still
    # runs when this step fails.
    set -o pipefail
    pytest test/registered/unit/ \
      --cov --cov-config=.coveragerc \
      --cov-report=term-missing:skip-covered \
      --continue-on-collection-errors \
      -v | tee coverage_output.txt
- name: Write coverage to summary
if: always()
run: |
echo "## Unit Test Code Coverage" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "**Commit:** \`${GITHUB_SHA::8}\` | **Branch:** \`${GITHUB_REF_NAME}\`" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
# Test result line (e.g., "== 42 passed, 1 failed in 23.5s ==")
echo '```' >> $GITHUB_STEP_SUMMARY
grep -E '^=+.*passed' coverage_output.txt >> $GITHUB_STEP_SUMMARY || true
echo "" >> $GITHUB_STEP_SUMMARY
# Coverage total
grep -E '^TOTAL ' coverage_output.txt >> $GITHUB_STEP_SUMMARY || true
echo '```' >> $GITHUB_STEP_SUMMARY
# Partially covered core modules (1-49%) — most actionable for contributors
# Only show modules with testable logic; skip configs, models, layers, etc.
LOW_COV=$(awk '/^python\/.*%/ {
for (i=1; i<=NF; i++) {
if ($i ~ /^[0-9]+%$/) {
pct = $i + 0
if (pct >= 1 && pct < 50) printf "%-80s %5s %s\n", $1, $(i-2), $i
break
}
}
}' coverage_output.txt \
| grep -E '/(mem_cache|managers|sampling|parser|observability|function_call|entrypoints|speculative|multimodal|utils)/' \
| head -40 || true)
if [ -n "$LOW_COV" ]; then
echo "" >> $GITHUB_STEP_SUMMARY
echo "<details><summary>Core modules with coverage below 50% — good candidates for more unit tests</summary>" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
echo "$LOW_COV" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
echo "</details>" >> $GITHUB_STEP_SUMMARY
fi
json-export:
name: JSON Export
runs-on: ubuntu-latest
if: inputs.output_format == 'json'
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Generate JSON Report
run: |
python scripts/ci/utils/ci_coverage_report.py --output-format json > ci_coverage.json
- name: Upload JSON artifact
uses: actions/upload-artifact@v4
with:
name: ci-coverage-report
path: ci_coverage.json
================================================
FILE: .github/workflows/ci-failure-monitor.yml
================================================
name: CI Failure Monitor
on:
schedule:
- cron: '0 */12 * * *' # Every 12 hours
workflow_dispatch:
concurrency:
group: ci-failure-monitor-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
actions: read
jobs:
failure-analysis:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.14'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install requests slack_sdk
- name: Run Failure Analysis
  env:
    GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
    GH_PAT_FOR_RUNNER_ADMIN: ${{ secrets.GH_PAT_FOR_RUNNER_ADMIN }}
    PYTHONUNBUFFERED: 1
    PYTHONIOENCODING: utf-8
  run: |
    cd scripts/ci_monitor
    # Writes a timestamped JSON report into scripts/ci_monitor; the later
    # upload and Slack steps pick it up via the ci_failure_analysis_*.json glob.
    # The token and the output path are quoted so they always reach the script
    # as exactly one argument each.
    python ci_failures_analysis.py \
      --token "$GITHUB_TOKEN" \
      --limit 100 \
      --output "ci_failure_analysis_$(date +%Y%m%d_%H%M%S).json"
- name: Upload Analysis Results
uses: actions/upload-artifact@v4
with:
name: ci-failure-analysis-${{ github.run_number }}
path: |
scripts/ci_monitor/ci_failure_analysis_*.json
retention-days: 7
- name: Send Slack Notification
if: always()
env:
SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }}
run: |
cd scripts/ci_monitor
LATEST_REPORT=$(ls -t ci_failure_analysis_*.json | head -1)
if [ ! -f "$LATEST_REPORT" ]; then
echo "No report found, so skipping Slack notification"
exit 0
fi
if [ -n "$SGLANG_DIFFUSION_SLACK_TOKEN" ]; then
python3 post_ci_failures_to_slack.py --report-file "$LATEST_REPORT"
else
echo "SGLANG_DIFFUSION_SLACK_TOKEN not configured, skipping notification"
fi
================================================
FILE: .github/workflows/close-inactive-issues.yml
================================================
name: Close Inactive Issues
on:
schedule:
- cron: '0 0 * * *'
workflow_dispatch:
permissions:
issues: write
contents: read
jobs:
close-inactive-issues:
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-latest
steps:
- name: Check and close inactive issues
  uses: actions/github-script@v6
  with:
    github-token: ${{secrets.GITHUB_TOKEN}}
    script: |
      // Close open issues that have had no activity for 60 days and tag them
      // with the 'inactive' label. Issues labelled 'good first issue' are
      // never closed.
      const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000);
      const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
      console.log(`Owner: ${owner}, Repo: ${repo}`);

      // Fetch one page (100 entries) of open issues, least recently updated first.
      // NOTE(review): the REST "list repository issues" endpoint also returns
      // pull requests; they are not filtered out here - confirm that is intended.
      async function fetchIssues(page = 1) {
        console.log(`Fetching issues for ${owner}/${repo}, page ${page}`);
        return await github.rest.issues.listForRepo({
          owner,
          repo,
          state: 'open',
          sort: 'updated',
          direction: 'asc',
          per_page: 100,
          page: page
        });
      }

      // Phase 1 (read-only): walk the pages and gather every inactive issue.
      // Closing issues while paginating removes them from the `state: 'open'`
      // listing, which shifts the following pages forward and causes inactive
      // issues to be skipped; collecting first keeps the page boundaries stable.
      // Returns whatever was gathered so far if a page fetch fails.
      async function collectInactiveIssues() {
        const inactive = [];
        let page = 1;
        while (true) {
          let issues;
          try {
            issues = await fetchIssues(page);
          } catch (error) {
            console.error(`Error fetching issues on page ${page}: ${error.message}`);
            return inactive;
          }
          console.log(`Fetched ${issues.data.length} issues on page ${page}`);
          if (issues.data.length === 0) {
            return inactive;
          }
          for (const issue of issues.data) {
            // Skip if the issue has 'good first issue' label
            if (issue.labels.some(label => label.name === 'good first issue')) {
              console.log(`Skipping issue #${issue.number} as it's marked as 'good first issue'`);
              continue;
            }
            if (new Date(issue.updated_at) < sixtyDaysAgo) {
              inactive.push(issue);
            } else {
              // Results are sorted by update time ascending, so everything
              // after the first active issue is active as well.
              console.log(`Issue #${issue.number} is still active. Stopping processing.`);
              return inactive;
            }
          }
          page += 1;
        }
      }

      // Phase 2: close one issue, add the 'inactive' label, and leave a comment.
      // A failure is logged and does not stop the remaining issues.
      async function closeIssue(issue) {
        try {
          await github.rest.issues.update({
            owner,
            repo,
            issue_number: issue.number,
            state: 'closed',
            labels: [...issue.labels.map(l => l.name), 'inactive']
          });
          await github.rest.issues.createComment({
            owner,
            repo,
            issue_number: issue.number,
            body: 'This issue has been automatically closed due to inactivity. Please feel free to reopen it if needed.'
          });
          console.log(`Closed issue #${issue.number} due to inactivity.`);
        } catch (error) {
          console.error(`Failed to close issue #${issue.number}: ${error.message}`);
        }
      }

      async function processIssues() {
        console.log('Starting to process issues');
        console.log(`Repository: ${owner}/${repo}`);
        const inactiveIssues = await collectInactiveIssues();
        for (const issue of inactiveIssues) {
          await closeIssue(issue);
        }
        console.log('Finished processing issues');
      }

      await processIssues();
================================================
FILE: .github/workflows/diffusion-ci-gt-gen.yml
================================================
name: Diffusion CI Ground Truth Generation
on:
workflow_dispatch:
inputs:
ref:
description: 'Git ref to checkout'
required: false
default: ''
type: string
case_ids:
description: 'Specific case IDs to run (space-separated, optional)'
required: false
default: ''
type: string
concurrency:
group: diffusion-ci-gt-gen-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: write
actions: read
jobs:
multimodal-diffusion-gen-1gpu:
if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner
strategy:
matrix:
part: [0, 1]
timeout-minutes: 150
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- name: Install dependencies
run: bash scripts/ci/cuda/ci_install_dependency.sh diffusion
- name: Generate outputs
  env:
    # Passed through the environment instead of being interpolated into the
    # script text, so the free-form dispatch input cannot inject shell syntax.
    CASE_IDS: ${{ inputs.case_ids }}
  run: |
    cd python
    # Optional: the space-separated case IDs become separate values after
    # --case-ids; when the input is empty the flag is omitted entirely.
    case_args=()
    if [ -n "$CASE_IDS" ]; then
      read -r -a case_ids <<< "$CASE_IDS"
      case_args=(--case-ids "${case_ids[@]}")
    fi
    python -m sglang.multimodal_gen.test.scripts.gen_diffusion_ci_outputs \
      --suite 1-gpu \
      --partition-id ${{ matrix.part }} \
      --total-partitions 2 \
      --out-dir ./diffusion-ci-outputs \
      --continue-on-error \
      "${case_args[@]}"
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: diffusion-gen-1gpu-part${{ matrix.part }}
path: python/diffusion-ci-outputs
retention-days: 7
multimodal-diffusion-gen-2gpu:
if: github.repository == 'sgl-project/sglang'
runs-on: 2-gpu-runner
strategy:
matrix:
part: [0, 1]
timeout-minutes: 150
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.ref || github.ref }}
- name: Install dependencies
run: bash scripts/ci/cuda/ci_install_dependency.sh diffusion
- name: Generate outputs
  env:
    # Passed through the environment instead of being interpolated into the
    # script text, so the free-form dispatch input cannot inject shell syntax.
    CASE_IDS: ${{ inputs.case_ids }}
  run: |
    cd python
    # Optional: the space-separated case IDs become separate values after
    # --case-ids; when the input is empty the flag is omitted entirely.
    case_args=()
    if [ -n "$CASE_IDS" ]; then
      read -r -a case_ids <<< "$CASE_IDS"
      case_args=(--case-ids "${case_ids[@]}")
    fi
    python -m sglang.multimodal_gen.test.scripts.gen_diffusion_ci_outputs \
      --suite 2-gpu \
      --partition-id ${{ matrix.part }} \
      --total-partitions 2 \
      --out-dir ./diffusion-ci-outputs \
      --continue-on-error \
      "${case_args[@]}"
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: diffusion-gen-2gpu-part${{ matrix.part }}
path: python/diffusion-ci-outputs
retention-days: 7
diffusion-ci-push:
needs: [multimodal-diffusion-gen-1gpu, multimodal-diffusion-gen-2gpu]
if: github.repository == 'sgl-project/sglang'
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Download artifacts
uses: actions/download-artifact@v4
with:
pattern: diffusion-gen-*
path: combined
merge-multiple: true
- name: Collect image files
run: |
mkdir -p gt_images
find combined \( -name "*.png" -o -name "*.jpg" -o -name "*.jpeg" -o -name "*.webp" \) -type f -exec cp -f {} gt_images/ \;
- name: Publish GT images to sglang-bot/sglang-ci-data
env:
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
run: python scripts/ci/utils/publish_diffusion_gt.py --source-dir gt_images
================================================
FILE: .github/workflows/execute-notebook.yml
================================================
name: Execute Notebooks
on:
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened, labeled]
paths:
- "python/sglang/**"
- "docs/**"
- "!python/sglang/**/*.md"
- "!docs/**/*.md"
workflow_dispatch:
concurrency:
group: execute-notebook-${{ github.ref }}
cancel-in-progress: true
env:
SGLANG_IS_IN_CI: true
jobs:
call-gate:
# Align with PR Test: fail fast if PR doesn't have run-ci label.
# This makes /tag-and-rerun-ci work by rerunning this failed workflow.
uses: ./.github/workflows/pr-gate.yml
secrets: inherit
run-all-notebooks:
needs: [call-gate]
runs-on: 1-gpu-runner
if: github.event_name != 'pull_request' || needs.call-gate.result == 'success'
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci/cuda/ci_install_dependency.sh
pip install -r docs/requirements.txt
apt-get update && apt-get install -y pandoc parallel retry
ln -sf "$(which python3)" /usr/bin/python
- name: Setup Jupyter Kernel
run: |
python -m ipykernel install --user --name python3 --display-name "Python 3"
- name: Execute notebooks
timeout-minutes: 40
run: |
cd docs
make clean
make compile
notebook-finish:
needs: [
call-gate,
run-all-notebooks
]
runs-on: ubuntu-latest
if: always() && needs.run-all-notebooks.result != 'skipped'
steps:
- name: Check all dependent job statuses
run: |
results=(${{ join(needs.*.result, ' ') }})
for result in "${results[@]}"; do
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then
echo "Job failed with result: $result"
exit 1
fi
done
echo "All jobs completed successfully"
exit 0
================================================
FILE: .github/workflows/labeler.yml
================================================
name: Auto Label PRs
on:
pull_request_target:
types: [opened, synchronize, reopened]
permissions:
contents: read
pull-requests: write
jobs:
label:
runs-on: ubuntu-latest
steps:
- name: Auto-label by file changes
uses: actions/labeler@v5
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"
configuration-path: .github/labeler.yml
sync-labels: false
================================================
FILE: .github/workflows/lint.yml
================================================
name: Lint
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# Install the interpreter used to run pre-commit. Pinned to the same
# setup-python major version as the other workflows in this repository.
- name: Set up Python
  uses: actions/setup-python@v5
  with:
    python-version: "3.12"
- name: Install pre-commit hook
run: |
python -m pip install pre-commit
pre-commit install
- name: Run pre-commit checks
run: SKIP=no-commit-to-branch pre-commit run --all-files --show-diff-on-failure
- name: Run sgl-kernel clang-format checks
uses: DoozyX/clang-format-lint-action@v0.20
with:
source: sgl-kernel
extensions: h,c,cpp,hpp,cu,cuh,cc
clangFormatVersion: 20
style: file
================================================
FILE: .github/workflows/list-active-pr-runs.yml.yml
================================================
name: List Active Runs
on:
workflow_dispatch:
inputs:
workflows:
description: 'Space-separated list of workflow filenames to check'
required: false
type: string
default: 'pr-test.yml'
permissions:
actions: read
contents: read
pull-requests: read
jobs:
list-active-runs:
runs-on: ubuntu-latest
steps:
- name: Install GitHub CLI
run: sudo apt-get install -y gh jq
- name: List active runs grouped by PR
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO: ${{ github.repository }}
WORKFLOWS: ${{ github.event.inputs.workflows || 'pr-test.yml' }}
shell: bash
run: |
set -euo pipefail
echo "========================================="
echo "🔍 Active Workflow Runs Report"
echo "========================================="
echo ""
# Get all workflows or specific ones
read -r -a workflow_files <<< "${WORKFLOWS}"
echo "📋 Checking specified workflows: ${WORKFLOWS}"
echo ""
# Create a temporary file to store PR data
pr_data_file=$(mktemp)
# Process each workflow
# For every requested workflow, append one '|'-separated record per active run
# to $pr_data_file:
#   pr|workflow|run_id|status|running_jobs|queued_jobs|runners|queue_minutes|created_at|head_sha|attempt|event|branch
# The array expansion is quoted so a workflow name is never re-split or
# glob-expanded against the working directory.
for workflow_file in "${workflow_files[@]}"; do
  echo "Scanning workflow: $workflow_file"
  # Get all active runs (queued, waiting, in_progress)
  active_runs=$(gh run list \
    --repo "$REPO" \
    --workflow "$workflow_file" \
    --json databaseId,status,event,headBranch,createdAt,updatedAt,headSha,number,attempt \
    --limit 500 \
    | jq -c '.[] | select(.status=="queued" or .status=="waiting" or .status=="in_progress")')
  if [ -z "$active_runs" ]; then
    continue
  fi
  # Process each run (one compact JSON object per line)
  echo "$active_runs" | while read -r run; do
    run_id=$(echo "$run" | jq -r '.databaseId')
    run_status=$(echo "$run" | jq -r '.status')
    run_event=$(echo "$run" | jq -r '.event')
    created_at=$(echo "$run" | jq -r '.createdAt')
    head_sha=$(echo "$run" | jq -r '.headSha')
    run_attempt=$(echo "$run" | jq -r '.attempt // 1')
    # Get detailed run information (head repository owner and branch)
    run_details=$(gh api "repos/$REPO/actions/runs/$run_id" 2>/dev/null || true)
    if [ -z "$run_details" ]; then
      continue
    fi
    head_owner=$(echo "$run_details" | jq -r '.head_repository.owner.login // empty')
    head_branch=$(echo "$run_details" | jq -r '.head_branch // empty')
    if [ -z "$head_owner" ] || [ -z "$head_branch" ]; then
      continue
    fi
    # Find PR number (may be empty for non-PR runs)
    pr_number=$(gh api "repos/$REPO/pulls?state=open&head=${head_owner}:${head_branch}" \
      --jq '.[0].number // empty' 2>/dev/null || true)
    if [ -z "$pr_number" ]; then
      pr_number="NO_PR"
    fi
    # Get jobs for this run (with pagination to avoid missing jobs)
    jobs=$(gh api "repos/$REPO/actions/runs/$run_id/jobs" --paginate --jq '.jobs[]' | jq -s '.')
    running_jobs=$(echo "$jobs" | jq '[.[] | select(.status=="in_progress")] | length')
    queued_jobs=$(echo "$jobs" | jq '[.[] | select(.status=="queued" or .status=="waiting")] | length')
    # Get runner info for running jobs (comma-separated)
    runners=$(echo "$jobs" | jq -r '.[] | select(.status=="in_progress") | .runner_name // "N/A"' | paste -sd "," -)
    # Minutes elapsed since the run was created; falls back to 0 when the
    # timestamp cannot be parsed by `date`.
    current_time=$(date -u +%s)
    created_time=$(date -u -d "$created_at" +%s 2>/dev/null || echo "$current_time")
    queue_time=$((current_time - created_time))
    queue_minutes=$((queue_time / 60))
    # Store data in temporary file (unified format with event and branch)
    echo "$pr_number|$workflow_file|$run_id|$run_status|$running_jobs|$queued_jobs|$runners|$queue_minutes|$created_at|$head_sha|$run_attempt|$run_event|$head_branch" >> "$pr_data_file"
  done
done
echo ""
echo "========================================="
echo "📊 Active Runs Summary"
echo "========================================="
echo ""
if [ ! -s "$pr_data_file" ]; then
echo "✅ No active runs found"
rm -f "$pr_data_file"
exit 0
fi
# Get unique PR numbers (exclude NO_PR entries)
pr_numbers=$(cut -d'|' -f1 < "$pr_data_file" | grep -v '^NO_PR$' | sort -u || true)
# Separate high priority and normal PRs
high_priority_prs=()
normal_prs=()
for pr_num in $pr_numbers; do
labels=$(gh pr view "$pr_num" --repo "$REPO" --json labels \
| jq -r '.labels[].name' 2>/dev/null || true)
if echo "$labels" | grep -Fxq "high priority"; then
high_priority_prs+=($pr_num)
else
normal_prs+=($pr_num)
fi
done
# Combine: high priority first, then normal
sorted_pr_numbers=("${high_priority_prs[@]}" "${normal_prs[@]}")
pr_count=0
total_running=0
total_queued=0
# Print one report section per PR (high-priority PRs come first in
# sorted_pr_numbers) and add the PR's job counts to the overall totals.
for pr_num in "${sorted_pr_numbers[@]}"; do
  pr_count=$((pr_count + 1))
  # Get PR details
  pr_info=$(gh pr view "$pr_num" --repo "$REPO" --json title,author,labels,url 2>/dev/null || true)
  if [ -z "$pr_info" ]; then
    continue
  fi
  pr_title=$(echo "$pr_info" | jq -r '.title')
  pr_author=$(echo "$pr_info" | jq -r '.author.login')
  pr_url=$(echo "$pr_info" | jq -r '.url')
  # Join the label names with ", " inside jq. `paste -sd ", "` would instead
  # cycle through the delimiter characters and alternate between "," and " "
  # (producing "a,b c,d").
  pr_labels=$(echo "$pr_info" | jq -r '[.labels[].name] | join(", ")')
  if [ -z "$pr_labels" ]; then
    pr_labels="(no labels)"
  fi
  # Add priority indicator (exact label-name match, not a substring search)
  priority_indicator=""
  if echo "$pr_info" | jq -e 'any(.labels[]; .name == "high priority")' > /dev/null; then
    priority_indicator="🔴 [HIGH PRIORITY] "
  fi
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "🔗 ${priority_indicator}PR #$pr_num: $pr_title"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "👤 Author: $pr_author"
  echo "🏷️ Labels: $pr_labels"
  echo "🔗 URL: $pr_url"
  echo ""
  # Get all runs for this PR
  pr_runs=$(grep "^$pr_num|" "$pr_data_file")
  # The loop body runs in a pipeline subshell, so it only prints; the per-PR
  # totals are computed from the data file after the loop.
  echo "$pr_runs" | while read -r line; do
    workflow=$(echo "$line" | cut -d'|' -f2)
    run_id=$(echo "$line" | cut -d'|' -f3)
    status=$(echo "$line" | cut -d'|' -f4)
    running=$(echo "$line" | cut -d'|' -f5)
    queued=$(echo "$line" | cut -d'|' -f6)
    runners=$(echo "$line" | cut -d'|' -f7)
    queue_min=$(echo "$line" | cut -d'|' -f8)
    attempt=$(echo "$line" | cut -d'|' -f11)
    run_url="https://github.com/$REPO/actions/runs/$run_id"
    # Calculate retry count for this specific run
    retry_count=$((attempt - 1))
    # Show retry indicator
    retry_indicator=""
    if [ "$retry_count" -gt 0 ]; then
      retry_indicator=" 🔄 Retry #$retry_count"
    fi
    echo " 📦 Workflow: $workflow (Run #$run_id)$retry_indicator"
    echo " Status: $status"
    echo " 🟢 Running jobs: $running"
    echo " 🟡 Queued jobs: $queued"
    if [ "$running" -gt 0 ] && [ "$runners" != "" ]; then
      echo " 🖥️ Runners: $runners"
    fi
    if [ "$queue_min" -gt 0 ]; then
      echo " ⏱️ Queue time: ${queue_min} minutes"
    fi
    echo " 🔗 Run URL: $run_url"
    echo ""
  done
  # Summary for this PR
  pr_running_total=$(grep "^$pr_num|" "$pr_data_file" | cut -d'|' -f5 | awk '{sum+=$1} END {print sum+0}')
  pr_queued_total=$(grep "^$pr_num|" "$pr_data_file" | cut -d'|' -f6 | awk '{sum+=$1} END {print sum+0}')
  total_running=$((total_running + pr_running_total))
  total_queued=$((total_queued + pr_queued_total))
  echo " 📊 PR Total: $pr_running_total running, $pr_queued_total queued"
  echo ""
done
# --- Non-PR Runs Section ---
non_pr_runs=$(grep '^NO_PR|' "$pr_data_file" 2>/dev/null || true)
non_pr_running=0
non_pr_queued=0
if [ -n "$non_pr_runs" ]; then
echo "========================================="
echo "📦 Non-PR Runs (manual / scheduled / other)"
echo "========================================="
echo ""
echo "$non_pr_runs" | while read -r line; do
workflow=$(echo "$line" | cut -d'|' -f2)
run_id=$(echo "$line" | cut -d'|' -f3)
status=$(echo "$line" | cut -d'|' -f4)
running=$(echo "$line" | cut -d'|' -f5)
queued=$(echo "$line" | cut -d'|' -f6)
runners=$(echo "$line" | cut -d'|' -f7)
queue_min=$(echo "$line" | cut -d'|' -f8)
created=$(echo "$line" | cut -d'|' -f9)
attempt=$(echo "$line" | cut -d'|' -f11)
event=$(echo "$line" | cut -d'|' -f12)
branch=$(echo "$line" | cut -d'|' -f13)
run_url="https://github.com/$REPO/actions/runs/$run_id"
retry_count=$((attempt - 1))
retry_indicator=""
if [ "$retry_count" -gt 0 ]; then
retry_indicator=" 🔄 Retry #$retry_count"
fi
echo " 📦 Workflow: $workflow (Run #$run_id)$retry_indicator"
echo " Event: $event"
echo " Branch: $branch"
echo " Status: $status"
echo " 🟢 Running jobs: $running"
echo " 🟡 Queued jobs: $queued"
if [ "$running" -gt 0 ] && [ "$runners" != "" ]; then
echo " 🖥️ Runners: $runners"
fi
if [ "$queue_min" -gt 0 ]; then
echo " ⏱️ Queue time: ${queue_min} minutes"
fi
echo " 🔗 Run URL: $run_url"
echo ""
done
non_pr_running=$(echo "$non_pr_runs" | cut -d'|' -f5 | awk '{sum+=$1} END {print sum+0}')
non_pr_queued=$(echo "$non_pr_runs" | cut -d'|' -f6 | awk '{sum+=$1} END {print sum+0}')
non_pr_count=$(echo "$non_pr_runs" | wc -l | tr -d ' ')
total_running=$((total_running + non_pr_running))
total_queued=$((total_queued + non_pr_queued))
echo " 📊 Non-PR Total: $non_pr_running running, $non_pr_queued queued"
echo ""
fi
# Overall summary
echo "========================================="
echo "📈 Overall Summary"
echo "========================================="
echo "Total PRs with active runs: $pr_count"
echo "Total non-PR active runs: ${non_pr_count:-0}"
echo "Total running jobs: $total_running"
echo "Total queued jobs: $total_queued"
echo "========================================="
# Cleanup
rm -f "$pr_data_file"
================================================
FILE: .github/workflows/nightly-release-gateway.yml
================================================
# Nightly release workflow for SGLang Model Gateway
name: Nightly Release SGLang Model Gateway to PyPI
on:
schedule:
# Run at 2 AM UTC every day
- cron: '0 2 * * *'
workflow_dispatch: # Allow manual trigger
jobs:
build:
name: build on ${{ matrix.platform || matrix.os }} (${{ matrix.target }} - ${{ matrix.manylinux || 'auto' }})
runs-on: ${{ matrix.os }}-latest
strategy:
fail-fast: false
matrix:
os: [ubuntu, macos, windows]
target: [x86_64, aarch64]
manylinux: [auto]
include:
- os: ubuntu
platform: linux
- os: windows
ls: dir
target: x86_64
python-architecture: x64
interpreter: 3.9 3.10 3.11 3.12 3.13
- os: macos
target: aarch64
interpreter: 3.9 3.10 3.11 3.12 3.13
- os: ubuntu
platform: linux
target: aarch64
# musllinux
- os: ubuntu
platform: linux
target: x86_64
manylinux: musllinux_1_1
- os: ubuntu
platform: linux
target: aarch64
manylinux: musllinux_1_1
exclude:
- os: windows
target: aarch64
steps:
- uses: actions/checkout@v4
with:
path: sglang-repo
- name: Move sgl-model-gateway folder to root and delete sglang-repo
run: |
mv sglang-repo/sgl-model-gateway/* .
rm -rf sglang-repo
ls -alt
shell: bash
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.13"
architecture: ${{ matrix.python-architecture || 'x64' }}
- name: Modify version for nightly release
run: |
# Get current version from pyproject.toml
CURRENT_VERSION=$(python -c "import tomllib; print(tomllib.load(open('bindings/python/pyproject.toml', 'rb'))['project']['version'])" 2>/dev/null || python -c "import tomli; print(tomli.load(open('bindings/python/pyproject.toml', 'rb'))['project']['version'])")
# Create nightly version with date: e.g., 0.2.1.dev20250128
NIGHTLY_VERSION="${CURRENT_VERSION}.dev$(date +%Y%m%d)"
echo "Nightly version: $NIGHTLY_VERSION"
# Update pyproject.toml with nightly version (temporary, not committed)
sed -i.bak "s/version = \"${CURRENT_VERSION}\"/version = \"${NIGHTLY_VERSION}\"/" bindings/python/pyproject.toml
# Verify the change
cat bindings/python/pyproject.toml | grep "^version"
shell: bash
- name: Install twine and tomli
run: pip install -U twine tomli
- name: Install protoc (macOS)
if: matrix.os == 'macos'
run: brew install protobuf
- name: Install protoc (Windows)
if: matrix.os == 'windows'
run: choco install protoc -y
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
working-directory: bindings/python
target: ${{ matrix.target }}
manylinux: ${{ matrix.manylinux || 'auto' }}
args: --release --out dist --features vendored-openssl --interpreter ${{ matrix.interpreter || '3.9 3.10 3.11 3.12 3.13 3.14' }}
rust-toolchain: stable
docker-options: -e CI -e CC_aarch64_unknown_linux_gnu=aarch64-linux-gnu-gcc -e CXX_aarch64_unknown_linux_gnu=aarch64-linux-gnu-g++
before-script-linux: |
# Install build dependencies (perl/make for vendored OpenSSL, protoc for gRPC)
if command -v yum &> /dev/null; then
yum update -y && yum install -y wget unzip gcc gcc-c++ perl-core make
# Install cross-compilation toolchain for aarch64 if needed
if [ "${{ matrix.target }}" = "aarch64" ]; then
yum install -y gcc-aarch64-linux-gnu gcc-c++-aarch64-linux-gnu || true
fi
elif command -v apt-get &> /dev/null; then
apt-get update && apt-get install -y wget unzip gcc g++ perl make
# Install cross-compilation toolchain for aarch64 if needed
if [ "${{ matrix.target }}" = "aarch64" ]; then
apt-get install -y gcc-aarch64-linux-gnu g++-aarch64-linux-gnu || true
fi
fi
(cd /tmp && \
wget https://github.com/protocolbuffers/protobuf/releases/download/v32.0/protoc-32.0-linux-x86_64.zip && \
unzip protoc-32.0-linux-x86_64.zip -d /usr/local && \
rm protoc-32.0-linux-x86_64.zip)
protoc --version
- name: List built packages
run: ${{ matrix.ls || 'ls -lh' }} bindings/python/dist/
- name: Check packages
run: twine check --strict bindings/python/dist/*
- uses: actions/upload-artifact@v4
with:
name: packages-${{ matrix.os }}-${{ matrix.target }}-${{ matrix.manylinux || 'auto' }}
path: bindings/python/dist/
build-sdist:
name: Build SDist
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
path: sglang-repo
- name: Move sgl-model-gateway folder to root and delete sglang-repo
run: |
mv sglang-repo/sgl-model-gateway/* .
rm -rf sglang-repo
ls -alt
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.13"
- name: Modify version for nightly release
run: |
# Get current version from pyproject.toml
CURRENT_VERSION=$(python -c "import tomllib; print(tomllib.load(open('bindings/python/pyproject.toml', 'rb'))['project']['version'])" 2>/dev/null || python -c "import tomli; print(tomli.load(open('bindings/python/pyproject.toml', 'rb'))['project']['version'])")
# Create nightly version with date: e.g., 0.2.1.dev20250128
NIGHTLY_VERSION="${CURRENT_VERSION}.dev$(date +%Y%m%d)"
echo "Nightly version: $NIGHTLY_VERSION"
# Update pyproject.toml with nightly version (temporary, not committed)
sed -i "s/version = \"${CURRENT_VERSION}\"/version = \"${NIGHTLY_VERSION}\"/" bindings/python/pyproject.toml
# Verify the change
cat bindings/python/pyproject.toml | grep "^version"
- name: Build SDist
uses: PyO3/maturin-action@v1
with:
working-directory: bindings/python
command: sdist
args: --out dist
rust-toolchain: stable
- uses: actions/upload-artifact@v4
with:
name: sdist
path: bindings/python/dist/*.tar.gz
upload:
name: Upload to TestPyPI
if: github.repository == 'sgl-project/sglang' # Ensure this job only runs for the sgl-project/sglang repository
needs: [build, build-sdist]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v4
with:
path: dist
merge-multiple: true
- name: Upload to TestPyPI
  env:
    TWINE_USERNAME: __token__
    TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN_ROUTER }}
  run: |
    pip install twine
    # The nightly version is derived from the date, so a second run on the same
    # day (manual dispatch or a re-run after a partial upload) produces files
    # that already exist on the index; --skip-existing keeps that from failing
    # the job while still uploading anything that is missing.
    twine upload --repository testpypi --skip-existing dist/* --verbose
================================================
FILE: .github/workflows/nightly-test-amd-rocm720.yml
================================================
name: Nightly Test (AMD ROCm 7.2)
on:
schedule:
- cron: '30 17 * * *'
push:
branches:
- main
paths:
- "python/sglang/version.py"
workflow_dispatch:
inputs:
aiter_ref:
description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
required: false
type: string
default: ''
continue_on_error:
description: 'Continue on error (do not fail the workflow on test failures)'
required: false
type: boolean
default: true
job_select:
description: 'Select a job to run from dropdown (choose "all" to run all jobs)'
required: false
type: choice
default: 'all'
options:
- 'all'
- nightly-test-1-gpu-unit-rocm720
- nightly-accuracy-2-gpu-rocm720
- nightly-accuracy-2-gpu-vlm-rocm720
- nightly-perf-2-gpu-text-rocm720
- nightly-perf-2-gpu-vlm-rocm720
- nightly-accuracy-8-gpu-rocm720
- nightly-8-gpu-grok1-int4-rocm720
- nightly-8-gpu-grok2-rocm720
- nightly-8-gpu-deepseek-v31-rocm720
- nightly-8-gpu-deepseek-v32-rocm720
- nightly-8-gpu-deepseek-v32-mtp-rocm720
- nightly-8-gpu-deepseek-v3-kv-fp8-rocm720
- nightly-8-gpu-kimi-k25-rocm720
- nightly-8-gpu-qwen3-235b-rocm720
- nightly-8-gpu-qwen35-rocm720
- nightly-8-gpu-glm5-rocm720
- nightly-8-gpu-minimax-m25-rocm720
- nightly-1-gpu-zimage-turbo-rocm720
- nightly-test-1-gpu-mi35x-rocm720
- nightly-accuracy-8-gpu-mi35x-rocm720
- nightly-8-gpu-mi35x-grok1-int4-rocm720
- nightly-8-gpu-mi35x-grok2-rocm720
- nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720
- nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720
- nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720
- nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720
- nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720
- nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720
- nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720
- nightly-8-gpu-mi35x-kimi-k25-rocm720
- nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720
- nightly-8-gpu-mi35x-qwen35-rocm720
- nightly-8-gpu-mi35x-glm5-rocm720
- nightly-8-gpu-mi35x-minimax-m25-rocm720
job_filter:
description: 'Or type comma-separated job names (overrides dropdown if non-empty)'
required: false
type: string
default: ''
workflow_call:
inputs:
ref:
description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
required: false
type: string
default: ''
aiter_ref:
description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
required: false
type: string
default: ''
job_filter:
description: 'Select which job to run (leave empty or "all" to run all jobs)'
required: false
type: string
default: 'all'
continue_on_error:
description: 'Continue on error (do not fail the workflow on test failures)'
required: false
type: boolean
default: true
env:
AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}
concurrency:
# When called via workflow_call with ref set, use a unique group per caller run to avoid
# collisions with direct schedule/push triggers. We use inputs.ref (not github.event_name)
# to detect this, because github.event_name inherits from the caller in workflow_call.
group: nightly-test-amd-rocm720-${{ inputs.ref && format('caller-{0}', github.run_id) || github.ref }}
cancel-in-progress: ${{ !inputs.ref && github.even
Showing preview only (204K chars total). Download the full file or copy to clipboard to get everything.
gitextract_8m4rfrdr/
├── .claude/
│ └── skills/
│ ├── add-jit-kernel/
│ │ └── SKILL.md
│ ├── add-sgl-kernel/
│ │ └── SKILL.md
│ ├── sglang-bisect-ci-regression/
│ │ └── SKILL.md
│ └── write-sglang-test/
│ └── SKILL.md
├── .codespellrc
├── .coveragerc
├── .devcontainer/
│ ├── Dockerfile
│ └── devcontainer.json
├── .github/
│ ├── CI_PERMISSIONS.json
│ ├── CODEOWNERS
│ ├── FOLDER_README.md
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1-bug-report.yml
│ │ └── 2-feature-request.yml
│ ├── MAINTAINER.md
│ ├── actions/
│ │ ├── upload-cuda-coredumps/
│ │ │ └── action.yml
│ │ └── wait-for-jobs/
│ │ └── action.yml
│ ├── labeler.yml
│ ├── pull_request_template.md
│ ├── update_ci_permission.py
│ └── workflows/
│ ├── amd-aiter-scout.yml
│ ├── amd-ci-job-monitor.yml
│ ├── auto-tune.yml
│ ├── bot-bump-flashinfer-version.yml
│ ├── bot-bump-kernel-version-to-sglang.yml
│ ├── bot-bump-kernel-version.yml
│ ├── bot-bump-sglang-version.yml
│ ├── bot-cherry-pick.yml
│ ├── cancel-pr-workflow-on-merge.yml
│ ├── cancel-unfinished-pr-tests.yml
│ ├── ci-coverage-overview.yml
│ ├── ci-failure-monitor.yml
│ ├── close-inactive-issues.yml
│ ├── diffusion-ci-gt-gen.yml
│ ├── execute-notebook.yml
│ ├── labeler.yml
│ ├── lint.yml
│ ├── list-active-pr-runs.yml.yml
│ ├── nightly-release-gateway.yml
│ ├── nightly-test-amd-rocm720.yml
│ ├── nightly-test-amd.yml
│ ├── nightly-test-intel.yml
│ ├── nightly-test-npu.yml
│ ├── nightly-test-nvidia.yml
│ ├── open-pr-copy-from-oss.yml
│ ├── open-pr-copy-to-oss.yml
│ ├── patch-docker-dev.yml
│ ├── pr-benchmark-rust.yml
│ ├── pr-gate.yml
│ ├── pr-test-amd-rocm720.yml
│ ├── pr-test-amd.yml
│ ├── pr-test-npu.yml
│ ├── pr-test-rust.yml
│ ├── pr-test-xeon.yml
│ ├── pr-test-xpu.yml
│ ├── pr-test.yml
│ ├── release-branch-cut.yml
│ ├── release-docker-amd-nightly.yml
│ ├── release-docker-amd-rocm720-nightly.yml
│ ├── release-docker-amd.yml
│ ├── release-docker-cu13-framework.yml
│ ├── release-docker-dev.yml
│ ├── release-docker-gateway.yml
│ ├── release-docker-npu-nightly.yml
│ ├── release-docker-npu.yml
│ ├── release-docker-xeon.yml
│ ├── release-docker.yml
│ ├── release-docs.yml
│ ├── release-pypi-gateway.yml
│ ├── release-pypi-nightly.yml
│ ├── release-pypi-pr.yml
│ ├── release-pypi.yml
│ ├── release-tag.yml
│ ├── release-whl-kernel.yml
│ ├── rerun-ut.yml
│ ├── retag-docker.yml
│ ├── runner-utilization.yml
│ ├── slash-command-handler.yml
│ ├── stress-test.yml
│ └── weekly-test-nvidia.yml
├── .gitignore
├── .isort.cfg
├── .pre-commit-config.yaml
├── 3rdparty/
│ └── amd/
│ ├── profiling/
│ │ ├── PROFILING.md
│ │ ├── client.sh
│ │ ├── install_rpd.sh
│ │ ├── loadTracer.sh
│ │ ├── rpd.patch
│ │ ├── rpd_profile_server_enable.patch
│ │ ├── rpd_profile_server_enable_wCPU_activities.patch
│ │ ├── server.sh
│ │ └── torch_profiler.patch
│ ├── tuning/
│ │ ├── TUNING.md
│ │ └── benchmark_moe_rocm.py
│ └── wheel/
│ ├── README.md
│ ├── sgl-kernel/
│ │ ├── CMakeLists_rocm.txt
│ │ ├── build_rocm.sh
│ │ ├── rename_wheels_rocm.sh
│ │ └── rocm_hipify.py
│ └── sglang/
│ └── pyproject.toml
├── LICENSE
├── README.md
├── benchmark/
│ ├── asr/
│ │ ├── README.md
│ │ └── bench_sglang.py
│ ├── bench_attention_sink/
│ │ └── bench_attention_sink_triton.py
│ ├── bench_in_batch_prefix/
│ │ └── bench_in_batch_prefix.py
│ ├── bench_linear_attention/
│ │ ├── bench_gdn_decode.py
│ │ └── bench_gdn_prefill.py
│ ├── bench_rope/
│ │ └── benchmark_rope_index.py
│ ├── benchmark_batch/
│ │ ├── benchmark_batch.py
│ │ └── benchmark_tokenizer.py
│ ├── benchmark_vllm_060/
│ │ └── README.md
│ ├── blog_v0_2/
│ │ ├── 405b_sglang.sh
│ │ ├── 405b_trt.sh
│ │ ├── 405b_vllm.sh
│ │ ├── README.md
│ │ └── config.md
│ ├── boolq/
│ │ ├── README.md
│ │ ├── bench_sglang.py
│ │ ├── convert_parquet_to_json.py
│ │ └── parquet_to_json.sh
│ ├── ceval/
│ │ ├── README.md
│ │ └── bench_sglang.py
│ ├── deepseek_v3/
│ │ └── README.md
│ ├── dspy/
│ │ ├── README.md
│ │ └── bench_dspy_intro.py
│ ├── fla/
│ │ └── benchmark_layernorm_gated.py
│ ├── generative_agents/
│ │ ├── README.md
│ │ ├── agent_functions.py
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── gpt_oss/
│ │ └── README.md
│ ├── gsm8k/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── hellaswag/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── hf3fs/
│ │ ├── bench.sh
│ │ ├── bench_client.py
│ │ ├── bench_storage.py
│ │ └── bench_zerocopy.py
│ ├── hicache/
│ │ ├── README.md
│ │ ├── bench_long_context.py
│ │ ├── bench_mix.py
│ │ ├── bench_mix.sh
│ │ ├── bench_multiturn.py
│ │ ├── bench_serving.py
│ │ ├── data_processing.py
│ │ ├── download.sh
│ │ ├── nextqa.py
│ │ └── perf.py
│ ├── json_decode_regex/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── build_dataset.py
│ ├── json_jump_forward/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ ├── build_dataset.py
│ │ └── dataset.txt
│ ├── json_schema/
│ │ ├── README.md
│ │ └── bench_sglang.py
│ ├── kernels/
│ │ ├── all_reduce/
│ │ │ ├── benchmark_aiter.py
│ │ │ ├── benchmark_all_reduce.py
│ │ │ ├── benchmark_fused_ar_rms_amd.py
│ │ │ ├── benchmark_mscclpp.py
│ │ │ └── benchmark_torch_symm_mem.py
│ │ ├── decoding_attention_triton/
│ │ │ └── triton_flashinfer_cudnn.py
│ │ ├── deepep/
│ │ │ ├── deepep_utils.py
│ │ │ └── tuning_deepep.py
│ │ ├── deepseek/
│ │ │ ├── README.md
│ │ │ ├── benchmark_deepgemm_fp8_gemm.py
│ │ │ ├── benchmark_deepgemm_fp8_gemm_blackwell.py
│ │ │ └── benchmark_deepgemm_fp8_group_gemm.py
│ │ ├── elementwise/
│ │ │ └── benchmark_concat_mla.py
│ │ ├── flashinfer_allreduce_fusion/
│ │ │ ├── README.md
│ │ │ └── benchmark_fused_collective.py
│ │ ├── fused_moe_triton/
│ │ │ ├── README.md
│ │ │ ├── benchmark_sglang_fused_moe_triton.py
│ │ │ ├── benchmark_torch_compile_fused_moe.py
│ │ │ ├── benchmark_vllm_vs_sglang_fused_moe_triton.py
│ │ │ ├── common_utils.py
│ │ │ ├── tuning_client.py
│ │ │ ├── tuning_fused_moe_triton.py
│ │ │ ├── tuning_fused_moe_triton_sep.py
│ │ │ └── tuning_text.json
│ │ ├── quantization/
│ │ │ ├── README.md
│ │ │ ├── bench_fp4_quant.py
│ │ │ ├── bench_int8_quant.py
│ │ │ └── tuning_block_wise_kernel.py
│ │ ├── scheduler_batch/
│ │ │ ├── benchmark_get_last_loc_triton.py
│ │ │ └── benchmark_write_req_to_token_pool_triton.py
│ │ └── sliding_window_attention_triton/
│ │ └── bench_triton_swa_kernel.py
│ ├── line_retrieval/
│ │ ├── README.md
│ │ ├── bench_sglang.py
│ │ └── gen_data.py
│ ├── llava_bench/
│ │ ├── README.md
│ │ ├── bench_hf_llava_bench.sh
│ │ ├── bench_hf_mme.sh
│ │ ├── bench_sglang.py
│ │ ├── bench_sglang_mme.sh
│ │ └── download_images.py
│ ├── llm_judge/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── long_json_decode/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── build_dataset.py
│ ├── lora/
│ │ ├── launch_server.py
│ │ └── lora_bench.py
│ ├── mmlu/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── download_data.sh
│ ├── mmmu/
│ │ ├── README.md
│ │ ├── bench_hf.py
│ │ ├── bench_sglang.py
│ │ ├── data_utils.py
│ │ ├── eval_utils.py
│ │ └── prompt_format.yaml
│ ├── mtbench/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── bench_sglang_eagle.py
│ ├── multi_chain_reasoning/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── multi_document_qa/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── build_dataset.py
│ ├── multi_turn_chat/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ ├── data_gen.py
│ │ └── long_prompt_multi_turn.py
│ ├── prefill_only/
│ │ ├── bench_embeddings.py
│ │ ├── bench_score.py
│ │ └── util.py
│ ├── react/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ └── bench_sglang.py
│ ├── reasoning_benchmark/
│ │ ├── README.md
│ │ ├── answer_extraction.py
│ │ ├── bench_sglang.py
│ │ └── eval_utils.py
│ ├── tip_suggestion/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ ├── lmql_funcs.py
│ │ └── topic.jsonl
│ ├── tree_of_thought_deep/
│ │ ├── README.md
│ │ ├── bench_other.py
│ │ ├── bench_sglang.py
│ │ └── lmql_funcs.py
│ └── tree_of_thought_v0/
│ ├── README.md
│ ├── bench_other.py
│ └── bench_sglang.py
├── docker/
│ ├── Dockerfile
│ ├── compose.yaml
│ ├── configs/
│ │ ├── .zshrc
│ │ ├── opt/
│ │ │ ├── .gitconfig
│ │ │ ├── .tmux.conf
│ │ │ └── .vimrc
│ │ └── yank
│ ├── diffusion.Dockerfile
│ ├── gateway.Dockerfile
│ ├── k8s-sglang-distributed-sts.yaml
│ ├── k8s-sglang-service.yaml
│ ├── npu.Dockerfile
│ ├── rocm.Dockerfile
│ ├── sagemaker.Dockerfile
│ ├── serve
│ ├── xeon.Dockerfile
│ └── xpu.Dockerfile
├── docs/
│ ├── Makefile
│ ├── README.md
│ ├── _static/
│ │ └── css/
│ │ ├── custom_log.css
│ │ └── readthedocs.css
│ ├── advanced_features/
│ │ ├── attention_backend.md
│ │ ├── checkpoint_engine.md
│ │ ├── cuda_graph_for_multi_modal_encoder.md
│ │ ├── deterministic_inference.md
│ │ ├── dp_dpa_smg_guide.md
│ │ ├── dp_for_multi_modal_encoder.md
│ │ ├── epd_disaggregation.md
│ │ ├── expert_parallelism.md
│ │ ├── forward_hooks.md
│ │ ├── hicache.rst
│ │ ├── hicache_best_practices.md
│ │ ├── hicache_design.md
│ │ ├── hicache_storage_runtime_attach_detach.md
│ │ ├── hyperparameter_tuning.md
│ │ ├── lora.ipynb
│ │ ├── observability.md
│ │ ├── pd_disaggregation.md
│ │ ├── piecewise_cuda_graph.md
│ │ ├── pipeline_parallelism.md
│ │ ├── quantization.md
│ │ ├── quantized_kv_cache.md
│ │ ├── rfork.md
│ │ ├── separate_reasoning.ipynb
│ │ ├── server_arguments.md
│ │ ├── sgl_model_gateway.md
│ │ ├── sglang_for_rl.md
│ │ ├── speculative_decoding.md
│ │ ├── structured_outputs.ipynb
│ │ ├── structured_outputs_for_reasoning_models.ipynb
│ │ ├── tool_parser.ipynb
│ │ └── vlm_query.ipynb
│ ├── basic_usage/
│ │ ├── deepseek_ocr.md
│ │ ├── deepseek_v3.md
│ │ ├── deepseek_v32.md
│ │ ├── glm45.md
│ │ ├── glmv.md
│ │ ├── gpt_oss.md
│ │ ├── llama4.md
│ │ ├── minimax_m2.md
│ │ ├── native_api.ipynb
│ │ ├── offline_engine_api.ipynb
│ │ ├── ollama_api.md
│ │ ├── openai_api.rst
│ │ ├── openai_api_completions.ipynb
│ │ ├── openai_api_embeddings.ipynb
│ │ ├── openai_api_vision.ipynb
│ │ ├── popular_model_usage.rst
│ │ ├── qwen3.md
│ │ ├── qwen3_5.md
│ │ ├── qwen3_vl.md
│ │ ├── sampling_params.md
│ │ └── send_request.ipynb
│ ├── conf.py
│ ├── deploy.py
│ ├── developer_guide/
│ │ ├── bench_serving.md
│ │ ├── benchmark_and_profiling.md
│ │ ├── contribution_guide.md
│ │ ├── development_guide_using_docker.md
│ │ ├── development_jit_kernel_guide.md
│ │ ├── evaluating_new_models.md
│ │ ├── release_process.md
│ │ └── setup_github_runner.md
│ ├── diffusion/
│ │ ├── api/
│ │ │ ├── cli.md
│ │ │ ├── openai_api.md
│ │ │ └── post_processing.md
│ │ ├── ci_perf.md
│ │ ├── compatibility_matrix.md
│ │ ├── contributing.md
│ │ ├── environment_variables.md
│ │ ├── index.md
│ │ ├── installation.md
│ │ ├── performance/
│ │ │ ├── attention_backends.md
│ │ │ ├── cache/
│ │ │ │ ├── cache_dit.md
│ │ │ │ ├── index.md
│ │ │ │ └── teacache.md
│ │ │ ├── index.md
│ │ │ └── profiling.md
│ │ └── support_new_models.md
│ ├── get_started/
│ │ └── install.md
│ ├── index.rst
│ ├── performance_dashboard/
│ │ ├── README.md
│ │ ├── app.js
│ │ ├── fetch_metrics.py
│ │ ├── index.html
│ │ └── server.py
│ ├── platforms/
│ │ ├── amd_gpu.md
│ │ ├── apple_metal.md
│ │ ├── ascend_contribution_guide.md
│ │ ├── ascend_npu.md
│ │ ├── ascend_npu_best_practice.md
│ │ ├── ascend_npu_deepseek_example.md
│ │ ├── ascend_npu_environment_variables.md
│ │ ├── ascend_npu_glm5_examples.md
│ │ ├── ascend_npu_quantization.md
│ │ ├── ascend_npu_qwen3_5_examples.md
│ │ ├── ascend_npu_qwen3_examples.md
│ │ ├── ascend_npu_support.rst
│ │ ├── ascend_npu_support_features.md
│ │ ├── ascend_npu_support_models.md
│ │ ├── cpu_server.md
│ │ ├── mindspore_backend.md
│ │ ├── mthreads_gpu.md
│ │ ├── nvidia_jetson.md
│ │ ├── tpu.md
│ │ └── xpu.md
│ ├── references/
│ │ ├── custom_chat_template.md
│ │ ├── environment_variables.md
│ │ ├── faq.md
│ │ ├── frontend/
│ │ │ ├── choices_methods.md
│ │ │ ├── frontend_index.rst
│ │ │ └── frontend_tutorial.ipynb
│ │ ├── learn_more.md
│ │ ├── multi_node_deployment/
│ │ │ ├── deploy_on_k8s.md
│ │ │ ├── lws_pd/
│ │ │ │ ├── lws-examples/
│ │ │ │ │ ├── d-svc.yaml
│ │ │ │ │ ├── d.yaml
│ │ │ │ │ ├── lb.yaml
│ │ │ │ │ ├── p-svc.yaml
│ │ │ │ │ └── p.yaml
│ │ │ │ └── lws_pd_deploy.md
│ │ │ ├── multi_node.md
│ │ │ ├── multi_node_index.rst
│ │ │ └── rbg_pd/
│ │ │ └── deepseekv32_pd.md
│ │ ├── post_training_integration.md
│ │ ├── production_metrics.md
│ │ ├── production_request_trace.md
│ │ ├── release_lookup.rst
│ │ └── torch_compile_cache.md
│ ├── release_lookup/
│ │ ├── README.md
│ │ ├── generate_index.py
│ │ ├── index.html
│ │ └── release_index.json
│ ├── requirements.txt
│ ├── serve.sh
│ ├── supported_models/
│ │ ├── extending/
│ │ │ ├── index.rst
│ │ │ ├── mindspore_models.md
│ │ │ ├── modelscope.md
│ │ │ ├── support_new_models.md
│ │ │ └── transformers_fallback.md
│ │ ├── index.rst
│ │ ├── retrieval_ranking/
│ │ │ ├── classify_models.md
│ │ │ ├── embedding_models.md
│ │ │ ├── index.rst
│ │ │ └── rerank_models.md
│ │ ├── specialized/
│ │ │ ├── index.rst
│ │ │ └── reward_models.md
│ │ └── text_generation/
│ │ ├── diffusion_language_models.md
│ │ ├── generative_models.md
│ │ ├── index.rst
│ │ └── multimodal_language_models.md
│ └── wrap_run_llm.py
├── examples/
│ ├── assets/
│ │ └── .gitignore
│ ├── chat_template/
│ │ ├── qwen3_reranker.jinja
│ │ ├── qwen3_vl_reranker.jinja
│ │ ├── tool_chat_template_deepseekr1.jinja
│ │ ├── tool_chat_template_deepseekv3.jinja
│ │ ├── tool_chat_template_deepseekv31.jinja
│ │ ├── tool_chat_template_deepseekv32.jinja
│ │ ├── tool_chat_template_llama3.1_json.jinja
│ │ ├── tool_chat_template_llama4_pythonic.jinja
│ │ └── vision_template_sarashina_vl.jinja
│ ├── checkpoint_engine/
│ │ └── update.py
│ ├── frontend_language/
│ │ ├── quick_start/
│ │ │ ├── anthropic_example_chat.py
│ │ │ ├── anthropic_example_complete.py
│ │ │ ├── azure_openai_example_chat.py
│ │ │ ├── gemini_example_chat.py
│ │ │ ├── gemini_example_complete.py
│ │ │ ├── gemini_example_multimodal_chat.py
│ │ │ ├── local_example_chat.py
│ │ │ ├── local_example_complete.py
│ │ │ ├── local_example_llava_next.py
│ │ │ ├── openai_example_chat.py
│ │ │ ├── openai_example_complete.py
│ │ │ ├── openai_example_n.py
│ │ │ ├── openai_example_o1.py
│ │ │ ├── openrouter_example_chat.py
│ │ │ ├── together_example_chat.py
│ │ │ └── together_example_complete.py
│ │ └── usage/
│ │ ├── chinese_regex.py
│ │ ├── choices_logprob.py
│ │ ├── cot_decoding.py
│ │ ├── json_decode.py
│ │ ├── json_logprobs.py
│ │ ├── llava_video/
│ │ │ ├── srt_example_llava_v.py
│ │ │ └── srt_example_llava_v.sh
│ │ ├── openai_chat_speculative.py
│ │ ├── openai_speculative.py
│ │ ├── parallel_sample.py
│ │ ├── rag_using_parea/
│ │ │ └── trace_and_evaluate_rag_using_parea.ipynb
│ │ ├── readme_examples.py
│ │ ├── sgl_gen_min_tokens.py
│ │ ├── streaming.py
│ │ └── triton/
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── models/
│ │ └── character_generation/
│ │ ├── 1/
│ │ │ └── model.py
│ │ └── config.pbtxt
│ ├── monitoring/
│ │ ├── README.md
│ │ ├── docker-compose.yaml
│ │ ├── grafana/
│ │ │ ├── dashboards/
│ │ │ │ ├── config/
│ │ │ │ │ └── dashboard.yaml
│ │ │ │ └── json/
│ │ │ │ └── sglang-dashboard.json
│ │ │ └── datasources/
│ │ │ └── datasource.yaml
│ │ ├── opentelemetry.yaml
│ │ ├── prometheus.yaml
│ │ └── tracing_compose.yaml
│ ├── profiler/
│ │ └── nsys_profile_tools/
│ │ ├── README.md
│ │ ├── gputrc2graph.py
│ │ └── sglang_engine_model.json
│ ├── runtime/
│ │ ├── README.md
│ │ ├── engine/
│ │ │ ├── custom_server.py
│ │ │ ├── embedding.py
│ │ │ ├── fastapi_engine_inference.py
│ │ │ ├── launch_engine.py
│ │ │ ├── offline_batch_inference.py
│ │ │ ├── offline_batch_inference_async.py
│ │ │ ├── offline_batch_inference_eagle.py
│ │ │ ├── offline_batch_inference_qwen_1m.py
│ │ │ ├── offline_batch_inference_vlm.py
│ │ │ ├── readme.md
│ │ │ ├── save_remote_state.py
│ │ │ └── save_sharded_state.py
│ │ ├── hidden_states/
│ │ │ ├── hidden_states_engine.py
│ │ │ └── hidden_states_server.py
│ │ ├── lora.py
│ │ ├── multimodal/
│ │ │ ├── llama3_llava_server.py
│ │ │ ├── llava_onevision_server.py
│ │ │ ├── pixtral_server.py
│ │ │ └── qwen_llava_server.py
│ │ ├── multimodal_embedding.py
│ │ ├── openai_chat_with_response_prefill.py
│ │ ├── qwen3_vl_reranker.py
│ │ ├── reward_model.py
│ │ ├── token_in_token_out/
│ │ │ ├── token_in_token_out_llm_engine.py
│ │ │ ├── token_in_token_out_llm_server.py
│ │ │ ├── token_in_token_out_vlm_engine.py
│ │ │ └── token_in_token_out_vlm_server.py
│ │ └── vertex_predict.py
│ ├── sagemaker/
│ │ └── deploy_and_serve_endpoint.py
│ └── usage/
│ └── modelopt_quantize_and_export.py
├── python/
│ ├── pyproject.toml
│ ├── pyproject_cpu.toml
│ ├── pyproject_npu.toml
│ ├── pyproject_other.toml
│ ├── pyproject_xpu.toml
│ └── sglang/
│ ├── README.md
│ ├── __init__.py
│ ├── _mps_stub.py
│ ├── _triton_stub.py
│ ├── bench_offline_throughput.py
│ ├── bench_one_batch.py
│ ├── bench_one_batch_server.py
│ ├── bench_serving.py
│ ├── benchmark/
│ │ ├── __init__.py
│ │ ├── bench_utils.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── custom.py
│ │ │ ├── generated_shared_prefix.py
│ │ │ ├── image.py
│ │ │ ├── mmmu.py
│ │ │ ├── mooncake.py
│ │ │ ├── openai_dataset.py
│ │ │ ├── random.py
│ │ │ └── sharegpt.py
│ │ └── utils.py
│ ├── check_env.py
│ ├── cli/
│ │ ├── __init__.py
│ │ ├── generate.py
│ │ ├── main.py
│ │ ├── serve.py
│ │ └── utils.py
│ ├── compile_deep_gemm.py
│ ├── eval/
│ │ ├── llama3_eval.py
│ │ └── loogle_eval.py
│ ├── global_config.py
│ ├── jit_kernel/
│ │ ├── .clang-format
│ │ ├── __init__.py
│ │ ├── __main__.py
│ │ ├── add_constant.py
│ │ ├── awq_dequantize.py
│ │ ├── awq_marlin_repack.py
│ │ ├── benchmark/
│ │ │ ├── bench_awq_dequantize.py
│ │ │ ├── bench_awq_marlin_moe_repack.py
│ │ │ ├── bench_awq_marlin_repack.py
│ │ │ ├── bench_concat_mla.py
│ │ │ ├── bench_fused_add_rmsnorm.py
│ │ │ ├── bench_fused_norm_scale_shift.py
│ │ │ ├── bench_gptq_marlin.py
│ │ │ ├── bench_gptq_marlin_repack.py
│ │ │ ├── bench_hadamard.py
│ │ │ ├── bench_hicache.py
│ │ │ ├── bench_moe_wna16_marlin.py
│ │ │ ├── bench_norm.py
│ │ │ ├── bench_norm_impls.py
│ │ │ ├── bench_nvfp4_blockwise_moe.py
│ │ │ ├── bench_nvfp4_quant.py
│ │ │ ├── bench_nvfp4_scaled_mm.py
│ │ │ ├── bench_per_tensor_quant_fp8.py
│ │ │ ├── bench_per_token_group_quant_8bit.py
│ │ │ ├── bench_qknorm.py
│ │ │ ├── bench_qknorm_across_heads.py
│ │ │ ├── bench_qwen_image_modulation.py
│ │ │ ├── bench_renorm.py
│ │ │ ├── bench_rmsnorm.py
│ │ │ ├── bench_rope.py
│ │ │ ├── bench_store_cache.py
│ │ │ └── utils.py
│ │ ├── concat_mla.py
│ │ ├── csrc/
│ │ │ ├── add_constant.cuh
│ │ │ ├── diffusion/
│ │ │ │ └── timestep_embedding.cuh
│ │ │ ├── elementwise/
│ │ │ │ ├── concat_mla.cuh
│ │ │ │ ├── fused_add_rmsnorm.cuh
│ │ │ │ ├── fused_metadata_copy.cuh
│ │ │ │ ├── kvcache.cuh
│ │ │ │ ├── pos_enc.cuh
│ │ │ │ ├── qknorm.cuh
│ │ │ │ ├── qknorm_across_heads.cuh
│ │ │ │ ├── rmsnorm.cuh
│ │ │ │ └── rope.cuh
│ │ │ ├── fast-hadamard-transform/
│ │ │ │ ├── code_gen.py
│ │ │ │ ├── fast_hadamard_transform.h
│ │ │ │ ├── fast_hadamard_transform_common.h
│ │ │ │ ├── fast_hadamard_transform_special.h
│ │ │ │ ├── hadamard_jit.cuh
│ │ │ │ └── static_switch.h
│ │ │ ├── gemm/
│ │ │ │ ├── awq_dequantize.cuh
│ │ │ │ ├── marlin/
│ │ │ │ │ ├── awq_marlin_repack.cuh
│ │ │ │ │ ├── dequant.h
│ │ │ │ │ ├── gptq_marlin.cuh
│ │ │ │ │ ├── gptq_marlin_repack.cuh
│ │ │ │ │ ├── kernel.h
│ │ │ │ │ ├── marlin.cuh
│ │ │ │ │ ├── marlin_dtypes.cuh
│ │ │ │ │ └── marlin_template.h
│ │ │ │ ├── marlin_moe/
│ │ │ │ │ ├── kernel.h
│ │ │ │ │ ├── marlin_template.h
│ │ │ │ │ └── moe_wna16_marlin.cuh
│ │ │ │ ├── nvfp4/
│ │ │ │ │ ├── nvfp4_expert_quant.cuh
│ │ │ │ │ ├── nvfp4_quant.cuh
│ │ │ │ │ ├── nvfp4_quant_entry.cuh
│ │ │ │ │ ├── nvfp4_quant_kernels.cuh
│ │ │ │ │ ├── nvfp4_scaled_mm_entry.cuh
│ │ │ │ │ └── nvfp4_scaled_mm_kernels.cuh
│ │ │ │ ├── per_tensor_quant_fp8.cuh
│ │ │ │ └── per_token_group_quant_8bit.cuh
│ │ │ ├── hicache.cuh
│ │ │ ├── lora/
│ │ │ │ └── moe_lora_align_kernel.cu
│ │ │ ├── moe/
│ │ │ │ └── nvfp4_blockwise_moe.cuh
│ │ │ ├── ngram_embedding.cuh
│ │ │ └── nsa/
│ │ │ └── fused_store_index_cache.cuh
│ │ ├── cutedsl_gdn.py
│ │ ├── diffusion/
│ │ │ ├── cutedsl/
│ │ │ │ ├── common/
│ │ │ │ │ ├── norm_fusion.py
│ │ │ │ │ └── reduce.py
│ │ │ │ ├── scale_residual_norm_scale_shift.py
│ │ │ │ └── utils.py
│ │ │ └── triton/
│ │ │ ├── mps_fallback.py
│ │ │ ├── norm.py
│ │ │ ├── npu_fallback.py
│ │ │ ├── rmsnorm_onepass.py
│ │ │ ├── rotary.py
│ │ │ └── scale_shift.py
│ │ ├── flash_attention_v4.py
│ │ ├── fused_metadata_copy.py
│ │ ├── fused_store_index_cache.py
│ │ ├── gptq_marlin.py
│ │ ├── gptq_marlin_repack.py
│ │ ├── hadamard.py
│ │ ├── hicache.py
│ │ ├── include/
│ │ │ └── sgl_kernel/
│ │ │ ├── atomic.cuh
│ │ │ ├── cta.cuh
│ │ │ ├── impl/
│ │ │ │ └── norm.cuh
│ │ │ ├── math.cuh
│ │ │ ├── runtime.cuh
│ │ │ ├── scalar_type.hpp
│ │ │ ├── source_location.h
│ │ │ ├── tensor.h
│ │ │ ├── tile.cuh
│ │ │ ├── type.cuh
│ │ │ ├── utils.cuh
│ │ │ ├── utils.h
│ │ │ ├── vec.cuh
│ │ │ └── warp.cuh
│ │ ├── kvcache.py
│ │ ├── moe_lora_align.py
│ │ ├── moe_wna16_marlin.py
│ │ ├── ngram_embedding.py
│ │ ├── norm.py
│ │ ├── nvfp4.py
│ │ ├── per_tensor_quant_fp8.py
│ │ ├── per_token_group_quant_8bit.py
│ │ ├── rope.py
│ │ ├── tests/
│ │ │ ├── test_add_constant.py
│ │ │ ├── test_awq_dequantize.py
│ │ │ ├── test_awq_marlin_moe_repack.py
│ │ │ ├── test_awq_marlin_repack.py
│ │ │ ├── test_concat_mla.py
│ │ │ ├── test_cutedsl_gdn.py
│ │ │ ├── test_flash_attention_4.py
│ │ │ ├── test_fused_add_rmsnorm.py
│ │ │ ├── test_fused_metadata_copy.py
│ │ │ ├── test_fused_norm_scale_shift.py
│ │ │ ├── test_fused_store_index_cache.py
│ │ │ ├── test_fused_verify_triton_gdn.py
│ │ │ ├── test_gptq_marlin.py
│ │ │ ├── test_gptq_marlin_repack.py
│ │ │ ├── test_hadamard_jit.py
│ │ │ ├── test_moe_lora_align_block_size.py
│ │ │ ├── test_moe_wna16_marlin.py
│ │ │ ├── test_norm_jit.py
│ │ │ ├── test_nvfp4_blockwise_moe.py
│ │ │ ├── test_nvfp4_gemm.py
│ │ │ ├── test_nvfp4_quant.py
│ │ │ ├── test_per_tensor_quant_fp8.py
│ │ │ ├── test_per_token_group_quant_8bit.py
│ │ │ ├── test_pos_enc.py
│ │ │ ├── test_qknorm.py
│ │ │ ├── test_qknorm_across_heads.py
│ │ │ ├── test_qwen_image_modulation.py
│ │ │ ├── test_renorm.py
│ │ │ ├── test_rmsnorm.py
│ │ │ ├── test_rope.py
│ │ │ ├── test_store_cache.py
│ │ │ └── test_timestep_embedding.py
│ │ ├── timestep_embedding.py
│ │ └── utils.py
│ ├── lang/
│ │ ├── api.py
│ │ ├── backend/
│ │ │ ├── anthropic.py
│ │ │ ├── base_backend.py
│ │ │ ├── litellm.py
│ │ │ ├── openai.py
│ │ │ ├── runtime_endpoint.py
│ │ │ └── vertexai.py
│ │ ├── chat_template.py
│ │ ├── choices.py
│ │ ├── interpreter.py
│ │ ├── ir.py
│ │ └── tracer.py
│ ├── launch_server.py
│ ├── multimodal_gen/
│ │ ├── .claude/
│ │ │ ├── CLAUDE.md
│ │ │ └── skills/
│ │ │ ├── diffusion-kernel/
│ │ │ │ ├── SKILL.md
│ │ │ │ ├── add-cuda-kernel.md
│ │ │ │ ├── add-triton-kernel.md
│ │ │ │ ├── diffusion-benchmark-and-profile.md
│ │ │ │ ├── nsight-profiler.md
│ │ │ │ ├── references/
│ │ │ │ │ ├── a100-optimization-guide.md
│ │ │ │ │ ├── h100-optimization-guide.md
│ │ │ │ │ ├── kernel-templates.md
│ │ │ │ │ ├── t4-optimization-guide.md
│ │ │ │ │ └── troubleshooting.md
│ │ │ │ ├── scripts/
│ │ │ │ │ ├── bench_diffusion_denoise.py
│ │ │ │ │ ├── bench_diffusion_rmsnorm.py
│ │ │ │ │ └── diffusion_skill_env.py
│ │ │ │ └── use-efficient-diffusion-kernels.md
│ │ │ ├── diffusion-optimal-perf/
│ │ │ │ └── SKILL.md
│ │ │ └── support-new-model/
│ │ │ └── SKILL.md
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── apps/
│ │ │ ├── ComfyUI_SGLDiffusion/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── core/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generator.py
│ │ │ │ │ ├── model_patcher.py
│ │ │ │ │ └── server_api.py
│ │ │ │ ├── executors/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── flux.py
│ │ │ │ │ ├── qwen_image.py
│ │ │ │ │ └── zimage.py
│ │ │ │ ├── nodes.py
│ │ │ │ ├── test/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── test_flux_pipeline.py
│ │ │ │ │ ├── test_qwen_image_edit_pipeline.py
│ │ │ │ │ ├── test_qwen_image_pipeline.py
│ │ │ │ │ └── test_zimage_pipeline.py
│ │ │ │ ├── utils.py
│ │ │ │ └── workflows/
│ │ │ │ ├── flux_sgld_sp.json
│ │ │ │ ├── qwen_image_sgld.json
│ │ │ │ ├── sgld_image2video.json
│ │ │ │ ├── sgld_text2img.json
│ │ │ │ └── z-image_sgld.json
│ │ │ └── webui/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ └── main.py
│ │ ├── benchmarks/
│ │ │ ├── bench_offline_throughput.py
│ │ │ ├── bench_serving.py
│ │ │ ├── compare_perf.py
│ │ │ └── datasets.py
│ │ ├── configs/
│ │ │ ├── __init__.py
│ │ │ ├── backend/
│ │ │ │ └── vmoba/
│ │ │ │ ├── wan_1.3B_77_448_832.json
│ │ │ │ └── wan_1.3B_77_480_832.json
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adapter/
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── ltx_2_connector.py
│ │ │ │ ├── base.py
│ │ │ │ ├── bridges/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── mova_dual_tower.py
│ │ │ │ ├── dits/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── flux.py
│ │ │ │ │ ├── glmimage.py
│ │ │ │ │ ├── helios.py
│ │ │ │ │ ├── hunyuan3d.py
│ │ │ │ │ ├── hunyuanvideo.py
│ │ │ │ │ ├── ltx_2.py
│ │ │ │ │ ├── mova_audio.py
│ │ │ │ │ ├── mova_video.py
│ │ │ │ │ ├── qwenimage.py
│ │ │ │ │ ├── sana.py
│ │ │ │ │ ├── wanvideo.py
│ │ │ │ │ └── zimage.py
│ │ │ │ ├── encoders/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── clip.py
│ │ │ │ │ ├── gemma2.py
│ │ │ │ │ ├── gemma_3.py
│ │ │ │ │ ├── llama.py
│ │ │ │ │ ├── qwen3.py
│ │ │ │ │ ├── qwen_image.py
│ │ │ │ │ └── t5.py
│ │ │ │ ├── vaes/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── dac.py
│ │ │ │ │ ├── flux.py
│ │ │ │ │ ├── glmimage.py
│ │ │ │ │ ├── hunyuan3d.py
│ │ │ │ │ ├── hunyuanvae.py
│ │ │ │ │ ├── ltx_audio.py
│ │ │ │ │ ├── ltx_video.py
│ │ │ │ │ ├── qwenimage.py
│ │ │ │ │ ├── sana.py
│ │ │ │ │ └── wanvae.py
│ │ │ │ └── vocoder/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ └── ltx_vocoder.py
│ │ │ ├── pipeline_configs/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── diffusers_generic.py
│ │ │ │ ├── flux.py
│ │ │ │ ├── flux_finetuned.py
│ │ │ │ ├── glm_image.py
│ │ │ │ ├── helios.py
│ │ │ │ ├── hunyuan.py
│ │ │ │ ├── hunyuan3d.py
│ │ │ │ ├── ltx_2.py
│ │ │ │ ├── mova.py
│ │ │ │ ├── qwen_image.py
│ │ │ │ ├── sana.py
│ │ │ │ ├── wan.py
│ │ │ │ └── zimage.py
│ │ │ ├── quantization.py
│ │ │ ├── sample/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── diffusers_generic.py
│ │ │ │ ├── flux.py
│ │ │ │ ├── glmimage.py
│ │ │ │ ├── helios.py
│ │ │ │ ├── hunyuan.py
│ │ │ │ ├── hunyuan3d.py
│ │ │ │ ├── ltx_2.py
│ │ │ │ ├── mova.py
│ │ │ │ ├── qwenimage.py
│ │ │ │ ├── sampling_params.py
│ │ │ │ ├── sana.py
│ │ │ │ ├── teacache.py
│ │ │ │ ├── wan.py
│ │ │ │ └── zimage.py
│ │ │ └── utils.py
│ │ ├── csrc/
│ │ │ ├── attn/
│ │ │ │ └── vmoba_attn/
│ │ │ │ ├── README.md
│ │ │ │ ├── setup.py
│ │ │ │ ├── tests/
│ │ │ │ │ └── test_vmoba_attn.py
│ │ │ │ └── vmoba/
│ │ │ │ ├── __init__.py
│ │ │ │ └── vmoba.py
│ │ │ └── render/
│ │ │ ├── hunyuan3d_rasterizer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── rasterizer.cpp
│ │ │ │ ├── rasterizer.h
│ │ │ │ └── rasterizer_gpu.cu
│ │ │ └── mesh_processor/
│ │ │ ├── __init__.py
│ │ │ └── mesh_processor.cpp
│ │ ├── docs/
│ │ │ └── quantization.md
│ │ ├── envs.py
│ │ ├── registry.py
│ │ ├── runtime/
│ │ │ ├── cache/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cache_dit_integration.py
│ │ │ │ └── teacache.py
│ │ │ ├── distributed/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── communication_op.py
│ │ │ │ ├── device_communicators/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base_device_communicator.py
│ │ │ │ │ ├── cpu_communicator.py
│ │ │ │ │ ├── cuda_communicator.py
│ │ │ │ │ ├── pynccl.py
│ │ │ │ │ └── pynccl_wrapper.py
│ │ │ │ ├── group_coordinator.py
│ │ │ │ ├── parallel_groups.py
│ │ │ │ ├── parallel_state.py
│ │ │ │ └── utils.py
│ │ │ ├── entrypoints/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cli/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cli_types.py
│ │ │ │ │ ├── generate.py
│ │ │ │ │ ├── main.py
│ │ │ │ │ ├── serve.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── diffusion_generator.py
│ │ │ │ ├── http_server.py
│ │ │ │ ├── openai/
│ │ │ │ │ ├── common_api.py
│ │ │ │ │ ├── image_api.py
│ │ │ │ │ ├── mesh_api.py
│ │ │ │ │ ├── protocol.py
│ │ │ │ │ ├── storage.py
│ │ │ │ │ ├── stores.py
│ │ │ │ │ ├── utils.py
│ │ │ │ │ └── video_api.py
│ │ │ │ ├── post_training/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── io_struct.py
│ │ │ │ │ └── weights_api.py
│ │ │ │ └── utils.py
│ │ │ ├── launch_server.py
│ │ │ ├── layers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── activation.py
│ │ │ │ ├── attention/
│ │ │ │ │ ├── STA_configuration.py
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── backends/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── aiter.py
│ │ │ │ │ │ ├── aiter_sage.py
│ │ │ │ │ │ ├── attention_backend.py
│ │ │ │ │ │ ├── flash_attn.py
│ │ │ │ │ │ ├── flash_attn_2.py
│ │ │ │ │ │ ├── sage_attn.py
│ │ │ │ │ │ ├── sage_attn3.py
│ │ │ │ │ │ ├── sdpa.py
│ │ │ │ │ │ ├── sliding_tile_attn.py
│ │ │ │ │ │ ├── sparse_linear_attn.py
│ │ │ │ │ │ ├── sparse_video_gen_2_attn.py
│ │ │ │ │ │ ├── video_sparse_attn.py
│ │ │ │ │ │ └── vmoba.py
│ │ │ │ │ ├── layer.py
│ │ │ │ │ ├── selector.py
│ │ │ │ │ └── turbo_layer.py
│ │ │ │ ├── custom_op.py
│ │ │ │ ├── elementwise.py
│ │ │ │ ├── layernorm.py
│ │ │ │ ├── linear.py
│ │ │ │ ├── lora/
│ │ │ │ │ └── linear.py
│ │ │ │ ├── mlp.py
│ │ │ │ ├── quantization/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── configs/
│ │ │ │ │ │ ├── base_config.py
│ │ │ │ │ │ └── nunchaku_config.py
│ │ │ │ │ ├── fp8.py
│ │ │ │ │ ├── modelslim.py
│ │ │ │ │ └── nunchaku_linear.py
│ │ │ │ ├── rotary_embedding/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── mrope.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── usp.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── visual_embedding.py
│ │ │ │ └── vocab_parallel_embedding.py
│ │ │ ├── loader/
│ │ │ │ ├── component_loaders/
│ │ │ │ │ ├── adapter_loader.py
│ │ │ │ │ ├── bridge_loader.py
│ │ │ │ │ ├── component_loader.py
│ │ │ │ │ ├── image_encoder_loader.py
│ │ │ │ │ ├── scheduler_loader.py
│ │ │ │ │ ├── text_encoder_loader.py
│ │ │ │ │ ├── transformer_loader.py
│ │ │ │ │ ├── vae_loader.py
│ │ │ │ │ ├── vl_encoder_loader.py
│ │ │ │ │ └── vocoder_loader.py
│ │ │ │ ├── fsdp_load.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── weight_utils.py
│ │ │ │ └── weights_updater.py
│ │ │ ├── managers/
│ │ │ │ ├── forward_context.py
│ │ │ │ ├── gpu_worker.py
│ │ │ │ └── scheduler.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── adapter/
│ │ │ │ │ └── ltx_2_connector.py
│ │ │ │ ├── bridges/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── mova_dual_tower.py
│ │ │ │ ├── dits/
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── causal_wanvideo.py
│ │ │ │ │ ├── flux.py
│ │ │ │ │ ├── flux_2.py
│ │ │ │ │ ├── glm_image.py
│ │ │ │ │ ├── helios.py
│ │ │ │ │ ├── hunyuan3d.py
│ │ │ │ │ ├── hunyuanvideo.py
│ │ │ │ │ ├── ltx_2.py
│ │ │ │ │ ├── mova_audio_dit.py
│ │ │ │ │ ├── mova_video_dit.py
│ │ │ │ │ ├── qwen_image.py
│ │ │ │ │ ├── sana.py
│ │ │ │ │ ├── wanvideo.py
│ │ │ │ │ └── zimage.py
│ │ │ │ ├── encoders/
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── bert.py
│ │ │ │ │ ├── clip.py
│ │ │ │ │ ├── gemma2.py
│ │ │ │ │ ├── gemma_3.py
│ │ │ │ │ ├── hunyuan3d.py
│ │ │ │ │ ├── llama.py
│ │ │ │ │ ├── mistral_3.py
│ │ │ │ │ ├── qwen2_5vl.py
│ │ │ │ │ ├── qwen3.py
│ │ │ │ │ ├── t5.py
│ │ │ │ │ └── vision.py
│ │ │ │ ├── parameter.py
│ │ │ │ ├── registry.py
│ │ │ │ ├── schedulers/
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── flow_match_pair.py
│ │ │ │ │ ├── hunyuan3d_scheduler.py
│ │ │ │ │ ├── scheduling_comfyui_passthrough.py
│ │ │ │ │ ├── scheduling_dpm_solver_multistep.py
│ │ │ │ │ ├── scheduling_flow_match_euler_discrete.py
│ │ │ │ │ ├── scheduling_flow_unipc_multistep.py
│ │ │ │ │ ├── scheduling_helios.py
│ │ │ │ │ ├── scheduling_self_forcing_flow_match.py
│ │ │ │ │ └── scheduling_unipc_multistep.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── vaes/
│ │ │ │ │ ├── autoencoder.py
│ │ │ │ │ ├── autoencoder_dc.py
│ │ │ │ │ ├── autoencoder_kl_flux2.py
│ │ │ │ │ ├── autoencoder_kl_qwenimage.py
│ │ │ │ │ ├── common.py
│ │ │ │ │ ├── dac.py
│ │ │ │ │ ├── hunyuan3d_vae.py
│ │ │ │ │ ├── hunyuanvae.py
│ │ │ │ │ ├── ltx_2_audio.py
│ │ │ │ │ ├── ltx_2_vae.py
│ │ │ │ │ ├── parallel/
│ │ │ │ │ │ ├── wan_common_utils.py
│ │ │ │ │ │ └── wan_dist_utils.py
│ │ │ │ │ └── wanvae.py
│ │ │ │ ├── vision_utils.py
│ │ │ │ └── vocoder/
│ │ │ │ └── ltx_2_vocoder.py
│ │ │ ├── pipelines/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── comfyui_flux_pipeline.py
│ │ │ │ ├── comfyui_qwen_image_pipeline.py
│ │ │ │ ├── comfyui_zimage_pipeline.py
│ │ │ │ ├── diffusers_pipeline.py
│ │ │ │ ├── flux.py
│ │ │ │ ├── flux_2.py
│ │ │ │ ├── flux_2_klein.py
│ │ │ │ ├── glm_image.py
│ │ │ │ ├── helios_pipeline.py
│ │ │ │ ├── hunyuan3d_pipeline.py
│ │ │ │ ├── hunyuan_pipeline.py
│ │ │ │ ├── ltx_2_pipeline.py
│ │ │ │ ├── mova_pipeline.py
│ │ │ │ ├── qwen_image.py
│ │ │ │ ├── sana.py
│ │ │ │ ├── wan_causal_dmd_pipeline.py
│ │ │ │ ├── wan_dmd_pipeline.py
│ │ │ │ ├── wan_i2v_dmd_pipeline.py
│ │ │ │ ├── wan_i2v_pipeline.py
│ │ │ │ ├── wan_pipeline.py
│ │ │ │ └── zimage_pipeline.py
│ │ │ ├── pipelines_core/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── composed_pipeline_base.py
│ │ │ │ ├── executors/
│ │ │ │ │ ├── parallel_executor.py
│ │ │ │ │ ├── pipeline_executor.py
│ │ │ │ │ └── sync_executor.py
│ │ │ │ ├── lora_format_adapter.py
│ │ │ │ ├── lora_pipeline.py
│ │ │ │ ├── schedule_batch.py
│ │ │ │ └── stages/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── causal_denoising.py
│ │ │ │ ├── comfyui_latent_preparation.py
│ │ │ │ ├── decoding.py
│ │ │ │ ├── decoding_av.py
│ │ │ │ ├── denoising.py
│ │ │ │ ├── denoising_av.py
│ │ │ │ ├── denoising_dmd.py
│ │ │ │ ├── encoding.py
│ │ │ │ ├── hunyuan3d_paint.py
│ │ │ │ ├── hunyuan3d_shape.py
│ │ │ │ ├── image_encoding.py
│ │ │ │ ├── input_validation.py
│ │ │ │ ├── latent_preparation.py
│ │ │ │ ├── latent_preparation_av.py
│ │ │ │ ├── model_specific_stages/
│ │ │ │ │ ├── glm_image.py
│ │ │ │ │ ├── helios_decoding.py
│ │ │ │ │ ├── helios_denoising.py
│ │ │ │ │ ├── mova.py
│ │ │ │ │ └── qwen_image_layered.py
│ │ │ │ ├── text_connector.py
│ │ │ │ ├── text_encoding.py
│ │ │ │ ├── timestep_preparation.py
│ │ │ │ └── validators.py
│ │ │ ├── platforms/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cpu.py
│ │ │ │ ├── cuda.py
│ │ │ │ ├── interface.py
│ │ │ │ ├── mps.py
│ │ │ │ ├── musa.py
│ │ │ │ ├── npu.py
│ │ │ │ └── rocm.py
│ │ │ ├── postprocess/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── realesrgan_upscaler.py
│ │ │ │ └── rife_interpolator.py
│ │ │ ├── scheduler_client.py
│ │ │ ├── server_args.py
│ │ │ └── utils/
│ │ │ ├── common.py
│ │ │ ├── distributed.py
│ │ │ ├── hf_diffusers_utils.py
│ │ │ ├── layerwise_offload.py
│ │ │ ├── logging_utils.py
│ │ │ ├── mesh3d_utils.py
│ │ │ ├── perf_logger.py
│ │ │ ├── profiler.py
│ │ │ └── quantization_utils.py
│ │ ├── test/
│ │ │ ├── __init__.py
│ │ │ ├── cli/
│ │ │ │ ├── test_generate_common.py
│ │ │ │ ├── test_generate_i2i.py
│ │ │ │ └── test_generate_t2i_perf.py
│ │ │ ├── run_suite.py
│ │ │ ├── scripts/
│ │ │ │ ├── gen_diffusion_ci_outputs.py
│ │ │ │ └── gen_perf_baselines.py
│ │ │ ├── server/
│ │ │ │ ├── ascend/
│ │ │ │ │ ├── perf_baselines_npu.json
│ │ │ │ │ ├── test_server_1_npu.py
│ │ │ │ │ ├── test_server_2_npu.py
│ │ │ │ │ ├── test_server_8_npu.py
│ │ │ │ │ └── testcase_configs_npu.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── perf_baselines.json
│ │ │ │ ├── test_server_2_gpu_a.py
│ │ │ │ ├── test_server_2_gpu_b.py
│ │ │ │ ├── test_server_a.py
│ │ │ │ ├── test_server_b.py
│ │ │ │ ├── test_server_common.py
│ │ │ │ ├── test_server_utils.py
│ │ │ │ ├── test_update_weights_from_disk.py
│ │ │ │ └── testcase_configs.py
│ │ │ ├── slack_utils.py
│ │ │ ├── test_files/
│ │ │ │ ├── launch_flux.json
│ │ │ │ └── launch_wan.json
│ │ │ ├── test_utils.py
│ │ │ └── unit/
│ │ │ ├── test_lora_format_adapter.py
│ │ │ ├── test_sampling_params.py
│ │ │ ├── test_server_args.py
│ │ │ └── test_storage.py
│ │ ├── third_party/
│ │ │ ├── __init__.py
│ │ │ └── pynvml.py
│ │ ├── tools/
│ │ │ ├── convert_hf_to_fp8.py
│ │ │ └── wan_repack.py
│ │ └── utils.py
│ ├── profiler.py
│ ├── srt/
│ │ ├── batch_invariant_ops/
│ │ │ ├── __init__.py
│ │ │ └── batch_invariant_ops.py
│ │ ├── batch_overlap/
│ │ │ ├── operations.py
│ │ │ ├── operations_strategy.py
│ │ │ ├── single_batch_overlap.py
│ │ │ └── two_batch_overlap.py
│ │ ├── checkpoint_engine/
│ │ │ ├── __init__.py
│ │ │ ├── checkpoint_engine_worker.py
│ │ │ └── update.py
│ │ ├── compilation/
│ │ │ ├── backend.py
│ │ │ ├── compilation_config.py
│ │ │ ├── compilation_counter.py
│ │ │ ├── compile.py
│ │ │ ├── compiler_interface.py
│ │ │ ├── cuda_piecewise_backend.py
│ │ │ ├── fix_functionalization.py
│ │ │ ├── fx_utils.py
│ │ │ ├── inductor_pass.py
│ │ │ ├── npu_piecewise_backend.py
│ │ │ ├── pass_manager.py
│ │ │ ├── piecewise_context_manager.py
│ │ │ └── weak_ref_tensor.py
│ │ ├── configs/
│ │ │ ├── __init__.py
│ │ │ ├── afmoe.py
│ │ │ ├── bailing_hybrid.py
│ │ │ ├── chatglm.py
│ │ │ ├── dbrx.py
│ │ │ ├── deepseek_ocr.py
│ │ │ ├── deepseekvl2.py
│ │ │ ├── device_config.py
│ │ │ ├── dots_ocr.py
│ │ │ ├── dots_vlm.py
│ │ │ ├── exaone.py
│ │ │ ├── falcon_h1.py
│ │ │ ├── granitemoehybrid.py
│ │ │ ├── internvl.py
│ │ │ ├── janus_pro.py
│ │ │ ├── jet_nemotron.py
│ │ │ ├── jet_vlm.py
│ │ │ ├── kimi_k25.py
│ │ │ ├── kimi_linear.py
│ │ │ ├── kimi_vl.py
│ │ │ ├── kimi_vl_moonvit.py
│ │ │ ├── lfm2.py
│ │ │ ├── lfm2_moe.py
│ │ │ ├── load_config.py
│ │ │ ├── longcat_flash.py
│ │ │ ├── mamba_utils.py
│ │ │ ├── model_config.py
│ │ │ ├── modelopt_config.py
│ │ │ ├── nano_nemotron_vl.py
│ │ │ ├── nemotron_h.py
│ │ │ ├── olmo3.py
│ │ │ ├── points_v15_chat.py
│ │ │ ├── qwen3_5.py
│ │ │ ├── qwen3_next.py
│ │ │ ├── qwen3_omni.py
│ │ │ ├── qwen3_vl.py
│ │ │ ├── radio.py
│ │ │ ├── step3_vl.py
│ │ │ ├── step3p5.py
│ │ │ ├── update_config.py
│ │ │ └── utils.py
│ │ ├── connector/
│ │ │ ├── __init__.py
│ │ │ ├── base_connector.py
│ │ │ ├── redis.py
│ │ │ ├── remote_instance.py
│ │ │ ├── s3.py
│ │ │ ├── serde/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── safe_serde.py
│ │ │ │ └── serde.py
│ │ │ └── utils.py
│ │ ├── constants.py
│ │ ├── constrained/
│ │ │ ├── base_grammar_backend.py
│ │ │ ├── grammar_manager.py
│ │ │ ├── llguidance_backend.py
│ │ │ ├── outlines_backend.py
│ │ │ ├── outlines_jump_forward.py
│ │ │ ├── reasoner_grammar_backend.py
│ │ │ ├── triton_ops/
│ │ │ │ └── bitmask_ops.py
│ │ │ ├── utils.py
│ │ │ └── xgrammar_backend.py
│ │ ├── debug_utils/
│ │ │ ├── __init__.py
│ │ │ ├── comparator/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── aligner/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── axis_aligner.py
│ │ │ │ │ ├── entrypoint/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── executor.py
│ │ │ │ │ │ ├── planner.py
│ │ │ │ │ │ ├── traced_types.py
│ │ │ │ │ │ └── types.py
│ │ │ │ │ ├── reorderer/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── executor.py
│ │ │ │ │ │ ├── planner.py
│ │ │ │ │ │ └── types.py
│ │ │ │ │ ├── token_aligner/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── concat_steps/
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── executor.py
│ │ │ │ │ │ │ └── thd_seq_lens_loader.py
│ │ │ │ │ │ ├── entrypoint.py
│ │ │ │ │ │ └── smart/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── aux_loader.py
│ │ │ │ │ │ ├── aux_plugins.py
│ │ │ │ │ │ ├── executor.py
│ │ │ │ │ │ ├── planner.py
│ │ │ │ │ │ ├── seq_info_builder.py
│ │ │ │ │ │ └── types.py
│ │ │ │ │ └── unsharder/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── executor.py
│ │ │ │ │ ├── parallel_info.py
│ │ │ │ │ ├── planner.py
│ │ │ │ │ └── types.py
│ │ │ │ ├── bundle_comparator.py
│ │ │ │ ├── bundle_matcher.py
│ │ │ │ ├── dims_spec/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── comment_parser.py
│ │ │ │ │ ├── dim_parser.py
│ │ │ │ │ ├── dims_parser.py
│ │ │ │ │ ├── modifier_parser.py
│ │ │ │ │ ├── tensor_naming.py
│ │ │ │ │ └── types.py
│ │ │ │ ├── display.py
│ │ │ │ ├── dp_utils.py
│ │ │ │ ├── entrypoint.py
│ │ │ │ ├── log_sink.py
│ │ │ │ ├── meta_overrider.py
│ │ │ │ ├── output_formatter.py
│ │ │ │ ├── output_types.py
│ │ │ │ ├── per_token_visualizer.py
│ │ │ │ ├── preset.py
│ │ │ │ ├── report_sink.py
│ │ │ │ ├── tensor_comparator/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── comparator.py
│ │ │ │ │ ├── formatter.py
│ │ │ │ │ └── types.py
│ │ │ │ ├── utils.py
│ │ │ │ └── visualizer/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── figure.py
│ │ │ │ ├── panels.py
│ │ │ │ └── preprocessing.py
│ │ │ ├── cuda_coredump.py
│ │ │ ├── dump_comparator.py
│ │ │ ├── dump_loader.py
│ │ │ ├── dumper.py
│ │ │ ├── log_parser.py
│ │ │ ├── model_truncator.py
│ │ │ ├── schedule_simulator/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── data_source/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── data_loader.py
│ │ │ │ │ └── data_synthesis.py
│ │ │ │ ├── entrypoint.py
│ │ │ │ ├── gpu_state.py
│ │ │ │ ├── metrics.py
│ │ │ │ ├── request.py
│ │ │ │ ├── routers/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── random_router.py
│ │ │ │ │ ├── round_robin_router.py
│ │ │ │ │ └── sticky_router.py
│ │ │ │ ├── schedulers/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── fifo_scheduler.py
│ │ │ │ └── simulator.py
│ │ │ ├── source_patcher/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── code_patcher.py
│ │ │ │ ├── source_editor.py
│ │ │ │ └── types.py
│ │ │ ├── tensor_dump_forward_hook.py
│ │ │ └── text_comparator.py
│ │ ├── disaggregation/
│ │ │ ├── ascend/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conn.py
│ │ │ │ └── transfer_engine.py
│ │ │ ├── base/
│ │ │ │ ├── __init__.py
│ │ │ │ └── conn.py
│ │ │ ├── common/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conn.py
│ │ │ │ └── utils.py
│ │ │ ├── decode.py
│ │ │ ├── decode_kvcache_offload_manager.py
│ │ │ ├── decode_schedule_batch_mixin.py
│ │ │ ├── encode_grpc_server.py
│ │ │ ├── encode_receiver.py
│ │ │ ├── encode_server.py
│ │ │ ├── fake/
│ │ │ │ ├── __init__.py
│ │ │ │ └── conn.py
│ │ │ ├── kv_events.py
│ │ │ ├── mooncake/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conn.py
│ │ │ │ └── utils.py
│ │ │ ├── mori/
│ │ │ │ ├── __init__.py
│ │ │ │ └── conn.py
│ │ │ ├── nixl/
│ │ │ │ ├── __init__.py
│ │ │ │ └── conn.py
│ │ │ ├── prefill.py
│ │ │ └── utils.py
│ │ ├── distributed/
│ │ │ ├── __init__.py
│ │ │ ├── communication_op.py
│ │ │ ├── device_communicators/
│ │ │ │ ├── all_reduce_utils.py
│ │ │ │ ├── cuda_wrapper.py
│ │ │ │ ├── custom_all_reduce.py
│ │ │ │ ├── custom_all_reduce_ops.py
│ │ │ │ ├── custom_all_reduce_utils.py
│ │ │ │ ├── hpu_communicator.py
│ │ │ │ ├── mooncake_transfer_engine.py
│ │ │ │ ├── npu_communicator.py
│ │ │ │ ├── pymscclpp.py
│ │ │ │ ├── pynccl.py
│ │ │ │ ├── pynccl_allocator.py
│ │ │ │ ├── pynccl_wrapper.py
│ │ │ │ ├── quick_all_reduce.py
│ │ │ │ ├── shm_broadcast.py
│ │ │ │ ├── torch_symm_mem.py
│ │ │ │ └── xpu_communicator.py
│ │ │ ├── naive_distributed.py
│ │ │ ├── parallel_state.py
│ │ │ └── utils.py
│ │ ├── dllm/
│ │ │ ├── algorithm/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── joint_threshold.py
│ │ │ │ └── low_confidence.py
│ │ │ ├── config.py
│ │ │ └── mixin/
│ │ │ ├── req.py
│ │ │ └── scheduler.py
│ │ ├── elastic_ep/
│ │ │ ├── elastic_ep.py
│ │ │ ├── expert_backup_client.py
│ │ │ └── expert_backup_manager.py
│ │ ├── entrypoints/
│ │ │ ├── EngineBase.py
│ │ │ ├── anthropic/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── protocol.py
│ │ │ │ └── serving.py
│ │ │ ├── context.py
│ │ │ ├── engine.py
│ │ │ ├── grpc_server.py
│ │ │ ├── harmony_utils.py
│ │ │ ├── http_server.py
│ │ │ ├── http_server_engine.py
│ │ │ ├── ollama/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── protocol.py
│ │ │ │ ├── serving.py
│ │ │ │ └── smart_router.py
│ │ │ ├── openai/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── encoding_dsv32.py
│ │ │ │ ├── protocol.py
│ │ │ │ ├── serving_base.py
│ │ │ │ ├── serving_chat.py
│ │ │ │ ├── serving_classify.py
│ │ │ │ ├── serving_completions.py
│ │ │ │ ├── serving_embedding.py
│ │ │ │ ├── serving_rerank.py
│ │ │ │ ├── serving_responses.py
│ │ │ │ ├── serving_score.py
│ │ │ │ ├── serving_tokenize.py
│ │ │ │ ├── serving_transcription.py
│ │ │ │ ├── tool_server.py
│ │ │ │ ├── usage_processor.py
│ │ │ │ └── utils.py
│ │ │ ├── ssl_utils.py
│ │ │ ├── tool.py
│ │ │ ├── v1_loads.py
│ │ │ └── warmup.py
│ │ ├── environ.py
│ │ ├── eplb/
│ │ │ ├── __init__.py
│ │ │ ├── eplb_algorithms/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── deepseek.py
│ │ │ │ ├── deepseek_vec.py
│ │ │ │ └── elasticity_aware.py
│ │ │ ├── eplb_manager.py
│ │ │ ├── eplb_simulator/
│ │ │ │ ├── __init__.py
│ │ │ │ └── reader.py
│ │ │ ├── expert_distribution.py
│ │ │ ├── expert_location.py
│ │ │ ├── expert_location_dispatch.py
│ │ │ └── expert_location_updater.py
│ │ ├── function_call/
│ │ │ ├── base_format_detector.py
│ │ │ ├── core_types.py
│ │ │ ├── deepseekv31_detector.py
│ │ │ ├── deepseekv32_detector.py
│ │ │ ├── deepseekv3_detector.py
│ │ │ ├── function_call_parser.py
│ │ │ ├── gigachat3_detector.py
│ │ │ ├── glm47_moe_detector.py
│ │ │ ├── glm4_moe_detector.py
│ │ │ ├── gpt_oss_detector.py
│ │ │ ├── hermes_detector.py
│ │ │ ├── internlm_detector.py
│ │ │ ├── json_array_parser.py
│ │ │ ├── kimik2_detector.py
│ │ │ ├── lfm2_detector.py
│ │ │ ├── llama32_detector.py
│ │ │ ├── mimo_detector.py
│ │ │ ├── minimax_m2.py
│ │ │ ├── mistral_detector.py
│ │ │ ├── pythonic_detector.py
│ │ │ ├── qwen25_detector.py
│ │ │ ├── qwen3_coder_detector.py
│ │ │ ├── step3_detector.py
│ │ │ ├── trinity_detector.py
│ │ │ └── utils.py
│ │ ├── grpc/
│ │ │ └── __init__.py
│ │ ├── hardware_backend/
│ │ │ └── npu/
│ │ │ ├── allocator_npu.py
│ │ │ ├── attention/
│ │ │ │ ├── ascend_backend.py
│ │ │ │ ├── ascend_torch_native_backend.py
│ │ │ │ └── mla_preprocess.py
│ │ │ ├── cmo.py
│ │ │ ├── graph_runner/
│ │ │ │ ├── eagle_draft_extend_npu_graph_runner.py
│ │ │ │ ├── eagle_draft_npu_graph_runner.py
│ │ │ │ ├── npu_graph_runner.py
│ │ │ │ └── vit_npu_graph_runner.py
│ │ │ ├── memory_pool_npu.py
│ │ │ ├── modules/
│ │ │ │ ├── deepseek_v2_attention_mla_npu.py
│ │ │ │ └── qwen_vl_processor.py
│ │ │ ├── moe/
│ │ │ │ └── topk.py
│ │ │ ├── quantization/
│ │ │ │ ├── fused_moe_method_npu.py
│ │ │ │ └── linear_method_npu.py
│ │ │ └── utils.py
│ │ ├── layers/
│ │ │ ├── activation.py
│ │ │ ├── amx_utils.py
│ │ │ ├── attention/
│ │ │ │ ├── aiter_backend.py
│ │ │ │ ├── attention_registry.py
│ │ │ │ ├── base_attn_backend.py
│ │ │ │ ├── cutlass_mla_backend.py
│ │ │ │ ├── double_sparsity_backend.py
│ │ │ │ ├── dual_chunk_flashattention_backend.py
│ │ │ │ ├── fla/
│ │ │ │ │ ├── chunk.py
│ │ │ │ │ ├── chunk_delta_h.py
│ │ │ │ │ ├── chunk_o.py
│ │ │ │ │ ├── chunk_scaled_dot_kkt.py
│ │ │ │ │ ├── cumsum.py
│ │ │ │ │ ├── fused_gdn_gating.py
│ │ │ │ │ ├── fused_norm_gate.py
│ │ │ │ │ ├── fused_recurrent.py
│ │ │ │ │ ├── fused_sigmoid_gating_recurrent.py
│ │ │ │ │ ├── index.py
│ │ │ │ │ ├── kda.py
│ │ │ │ │ ├── l2norm.py
│ │ │ │ │ ├── layernorm_gated.py
│ │ │ │ │ ├── op.py
│ │ │ │ │ ├── solve_tril.py
│ │ │ │ │ ├── utils.py
│ │ │ │ │ └── wy_fast.py
│ │ │ │ ├── flashattention_backend.py
│ │ │ │ ├── flashinfer_backend.py
│ │ │ │ ├── flashinfer_mla_backend.py
│ │ │ │ ├── flashmla_backend.py
│ │ │ │ ├── hybrid_attn_backend.py
│ │ │ │ ├── hybrid_linear_attn_backend.py
│ │ │ │ ├── intel_amx_backend.py
│ │ │ │ ├── linear/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── gdn_backend.py
│ │ │ │ │ ├── kda_backend.py
│ │ │ │ │ ├── kernels/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── gdn_cutedsl.py
│ │ │ │ │ │ ├── gdn_flashinfer.py
│ │ │ │ │ │ ├── gdn_triton.py
│ │ │ │ │ │ ├── kda_triton.py
│ │ │ │ │ │ └── kernel_backend.py
│ │ │ │ │ ├── lightning_attn.py
│ │ │ │ │ ├── lightning_backend.py
│ │ │ │ │ ├── linear_metadata.py
│ │ │ │ │ ├── seg_la.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── mamba/
│ │ │ │ │ ├── causal_conv1d.py
│ │ │ │ │ ├── causal_conv1d_triton.py
│ │ │ │ │ ├── mamba.py
│ │ │ │ │ ├── mamba2_metadata.py
│ │ │ │ │ ├── mamba_state_scatter_triton.py
│ │ │ │ │ ├── mixer2_rms_norm_gated.py
│ │ │ │ │ └── ops/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── layernorm_gated.py
│ │ │ │ │ ├── mamba_ssm.py
│ │ │ │ │ ├── ssd_bmm.py
│ │ │ │ │ ├── ssd_chunk_scan.py
│ │ │ │ │ ├── ssd_chunk_state.py
│ │ │ │ │ ├── ssd_combined.py
│ │ │ │ │ ├── ssd_state_passing.py
│ │ │ │ │ └── ssu_dispatch.py
│ │ │ │ ├── merge_state.py
│ │ │ │ ├── nsa/
│ │ │ │ │ ├── dequant_k_cache.py
│ │ │ │ │ ├── index_buf_accessor.py
│ │ │ │ │ ├── nsa_backend_mtp_precompute.py
│ │ │ │ │ ├── nsa_indexer.py
│ │ │ │ │ ├── nsa_mtp_verification.py
│ │ │ │ │ ├── quant_k_cache.py
│ │ │ │ │ ├── tilelang_kernel.py
│ │ │ │ │ ├── transform_index.py
│ │ │ │ │ ├── triton_kernel.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── nsa_backend.py
│ │ │ │ ├── tbo_backend.py
│ │ │ │ ├── torch_flex_backend.py
│ │ │ │ ├── torch_native_backend.py
│ │ │ │ ├── triton_backend.py
│ │ │ │ ├── triton_ops/
│ │ │ │ │ ├── decode_attention.py
│ │ │ │ │ ├── double_sparsity_attention.py
│ │ │ │ │ ├── extend_attention.py
│ │ │ │ │ ├── merge_state.py
│ │ │ │ │ ├── prefill_attention.py
│ │ │ │ │ ├── rocm_mla_decode_rope.py
│ │ │ │ │ └── trtllm_fp8_kv_kernel.py
│ │ │ │ ├── trtllm_mha_backend.py
│ │ │ │ ├── trtllm_mla_backend.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── vision.py
│ │ │ │ ├── vision_utils.py
│ │ │ │ ├── wave_backend.py
│ │ │ │ ├── wave_ops/
│ │ │ │ │ ├── decode_attention.py
│ │ │ │ │ ├── extend_attention.py
│ │ │ │ │ └── prefill_attention.py
│ │ │ │ └── xpu_backend.py
│ │ │ ├── communicator.py
│ │ │ ├── communicator_nsa_cp.py
│ │ │ ├── conv.py
│ │ │ ├── deep_gemm_wrapper/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── compile_utils.py
│ │ │ │ ├── configurer.py
│ │ │ │ └── entrypoint.py
│ │ │ ├── dp_attention.py
│ │ │ ├── elementwise.py
│ │ │ ├── flashinfer_comm_fusion.py
│ │ │ ├── int4fp8_utils.py
│ │ │ ├── layernorm.py
│ │ │ ├── linear.py
│ │ │ ├── logits_processor.py
│ │ │ ├── model_parallel.py
│ │ │ ├── modelopt_utils.py
│ │ │ ├── moe/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── cutlass_moe.py
│ │ │ │ ├── cutlass_moe_params.py
│ │ │ │ ├── cutlass_w4a8_moe.py
│ │ │ │ ├── ep_moe/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── kernels.py
│ │ │ │ │ └── layer.py
│ │ │ │ ├── flashinfer_cutedsl_moe.py
│ │ │ │ ├── flashinfer_trtllm_moe.py
│ │ │ │ ├── fused_moe_native.py
│ │ │ │ ├── fused_moe_triton/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── configs/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── triton_3_1_0/
│ │ │ │ │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=1024,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json
│ │ │ │ │ │ │ ├── E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=1280,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=2560,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=320,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=64,N=640,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=AMD_Instinct_MI325X.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=AMD_Radeon_Graphics.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=14336,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=AMD_Instinct_MI325X.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=AMD_Radeon_Graphics.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=1792,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=2048,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=AMD_Instinct_MI325X.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=AMD_Radeon_Graphics.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=3584,device_name=NVIDIA_L40S.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=4096,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI300X.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=AMD_Instinct_MI325X.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=AMD_Radeon_Graphics.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=7168,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ └── E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ ├── triton_3_2_0/
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=128,N=96,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
│ │ │ │ │ │ │ ├── E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
│ │ │ │ │ │ │ └── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── triton_3_3_0/
│ │ │ │ │ │ │ └── E=16,N=1024,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── triton_3_3_1/
│ │ │ │ │ │ │ ├── E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=768,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=320,device_name=NVIDIA_H20-3e.json
│ │ │ │ │ │ │ ├── E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ └── E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── triton_3_4_0/
│ │ │ │ │ │ │ ├── E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=128,N=1856,device_name=NVIDIA_L40S.json
│ │ │ │ │ │ │ ├── E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=128,N=928,device_name=NVIDIA_L40S.json
│ │ │ │ │ │ │ ├── E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=160,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ │ ├── E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_B200.json
│ │ │ │ │ │ │ ├── E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=256,N=512,device_name=NVIDIA_B200.json
│ │ │ │ │ │ │ ├── E=256,N=512,device_name=NVIDIA_H20.json
│ │ │ │ │ │ │ ├── E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ │ │ ├── E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json
│ │ │ │ │ │ │ ├── E=384,N=128,device_name=,dtype=int4_w4a16.json
│ │ │ │ │ │ │ ├── E=384,N=128,device_name=,dtype=int4_w4a16_down.json
│ │ │ │ │ │ │ ├── E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H20-3e.json
│ │ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_B200.json
│ │ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_H20-3e.json
│ │ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_H200.json
│ │ │ │ │ │ │ ├── E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ │ └── E=512,N=64,device_name=NVIDIA_H200.json
│ │ │ │ │ │ └── triton_3_5_1/
│ │ │ │ │ │ ├── E=128,N=1344,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=1856,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=232,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=232,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=2688,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=464,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=128,N=928,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=16,N=1856,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=16,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=16,N=2048,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H20,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_H200.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=192,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition.json
│ │ │ │ │ │ ├── E=161,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=161,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=20,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=20,N=1536,device_name=NVIDIA_H200.json
│ │ │ │ │ │ ├── E=256,N=1344,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=256,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=256,N=2688,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=256,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=256,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=256,N=672,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=256,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ │ ├── E=32,N=1856,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=32,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=32,N=928,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=32,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=40,N=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,per_channel_quant=True.json
│ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=128,device_name=NVIDIA_H200.json
│ │ │ │ │ │ ├── E=512,N=1344,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ ├── E=512,N=256,device_name=NVIDIA_H200.json
│ │ │ │ │ │ ├── E=512,N=2688,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=512,N=336,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=336,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=512,N=672,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=512,N=672,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=64,N=1856,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=64,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=64,N=2688,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=64,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=64,N=464,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=64,N=464,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=64,N=928,device_name=NVIDIA_B200.json
│ │ │ │ │ │ ├── E=64,N=928,device_name=NVIDIA_H100_80GB_HBM3.json
│ │ │ │ │ │ ├── E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ │ └── E=80,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128]_down.json
│ │ │ │ │ ├── fused_marlin_moe.py
│ │ │ │ │ ├── fused_moe.py
│ │ │ │ │ ├── fused_moe_triton_config.py
│ │ │ │ │ ├── fused_moe_triton_kernels.py
│ │ │ │ │ ├── layer.py
│ │ │ │ │ ├── moe_align_block_size.py
│ │ │ │ │ └── triton_kernels_moe.py
│ │ │ │ ├── kt_ep_wrapper.py
│ │ │ │ ├── moe_runner/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── deep_gemm.py
│ │ │ │ │ ├── flashinfer_trtllm.py
│ │ │ │ │ ├── marlin.py
│ │ │ │ │ ├── runner.py
│ │ │ │ │ ├── triton.py
│ │ │ │ │ └── triton_kernels.py
│ │ │ │ ├── rocm_moe_utils.py
│ │ │ │ ├── routed_experts_capturer.py
│ │ │ │ ├── router.py
│ │ │ │ ├── token_dispatcher/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── deepep.py
│ │ │ │ │ ├── flashinfer.py
│ │ │ │ │ ├── flashinfer_utils.py
│ │ │ │ │ ├── fuseep.py
│ │ │ │ │ ├── mooncake.py
│ │ │ │ │ ├── moriep.py
│ │ │ │ │ ├── nixl.py
│ │ │ │ │ └── standard.py
│ │ │ │ ├── topk.py
│ │ │ │ └── utils.py
│ │ │ ├── multimodal.py
│ │ │ ├── n_gram_embedding.py
│ │ │ ├── parameter.py
│ │ │ ├── pooler.py
│ │ │ ├── quantization/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── auto_round.py
│ │ │ │ ├── awq.py
│ │ │ │ ├── awq_triton.py
│ │ │ │ ├── base_config.py
│ │ │ │ ├── base_scheme.py
│ │ │ │ ├── bitsandbytes.py
│ │ │ │ ├── blockwise_int8.py
│ │ │ │ ├── compressed_tensors/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── compressed_tensors.py
│ │ │ │ │ ├── schemes/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── compressed_tensors_scheme.py
│ │ │ │ │ │ ├── compressed_tensors_w4a4_mxint4_moe.py
│ │ │ │ │ │ ├── compressed_tensors_w4a4_nvfp4.py
│ │ │ │ │ │ ├── compressed_tensors_w4a4_nvfp4_moe.py
│ │ │ │ │ │ ├── compressed_tensors_w4a8_int8_moe.py
│ │ │ │ │ │ ├── compressed_tensors_w8a16_fp8.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_fp8.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_fp8_moe.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_int8.py
│ │ │ │ │ │ ├── compressed_tensors_w8a8_int8_moe.py
│ │ │ │ │ │ ├── compressed_tensors_wNa16.py
│ │ │ │ │ │ └── compressed_tensors_wNa16_moe.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── configs/
│ │ │ │ │ ├── N=1280,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=4096,device_name=NVIDIA_L40,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=5120,K=1024,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=5120,K=2048,device_name=NVIDIA_L40,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=5120,K=3200,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=6400,K=5120,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ ├── N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
│ │ │ │ │ └── README.md
│ │ │ │ ├── fp4_utils.py
│ │ │ │ ├── fp8.py
│ │ │ │ ├── fp8_kernel.py
│ │ │ │ ├── fp8_utils.py
│ │ │ │ ├── fpgemm_fp8.py
│ │ │ │ ├── gguf.py
│ │ │ │ ├── gptq.py
│ │ │ │ ├── int8_kernel.py
│ │ │ │ ├── int8_utils.py
│ │ │ │ ├── kv_cache.py
│ │ │ │ ├── kvfp4_tensor.py
│ │ │ │ ├── marlin_utils.py
│ │ │ │ ├── marlin_utils_fp8.py
│ │ │ │ ├── modelopt_quant.py
│ │ │ │ ├── modelslim/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── modelslim.py
│ │ │ │ │ └── schemes/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── modelslim_scheme.py
│ │ │ │ │ ├── modelslim_w4a4_int4.py
│ │ │ │ │ ├── modelslim_w4a4_int4_moe.py
│ │ │ │ │ ├── modelslim_w4a8_int8_moe.py
│ │ │ │ │ ├── modelslim_w8a8_int8.py
│ │ │ │ │ └── modelslim_w8a8_int8_moe.py
│ │ │ │ ├── moe_wna16.py
│ │ │ │ ├── mxfp4.py
│ │ │ │ ├── mxfp4_tensor.py
│ │ │ │ ├── petit.py
│ │ │ │ ├── petit_utils.py
│ │ │ │ ├── qoq.py
│ │ │ │ ├── quark/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── quark.py
│ │ │ │ │ ├── schemes/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── quark_scheme.py
│ │ │ │ │ │ ├── quark_w4a4_mxfp4.py
│ │ │ │ │ │ ├── quark_w4a4_mxfp4_moe.py
│ │ │ │ │ │ ├── quark_w8a8_fp8.py
│ │ │ │ │ │ └── quark_w8a8_fp8_moe.py
│ │ │ │ │ └── utils.py
│ │ │ │ ├── quark_int4fp8_moe.py
│ │ │ │ ├── rocm_mxfp4_utils.py
│ │ │ │ ├── unquant.py
│ │ │ │ ├── utils.py
│ │ │ │ ├── w4afp8.py
│ │ │ │ ├── w8a8_fp8.py
│ │ │ │ └── w8a8_int8.py
│ │ │ ├── radix_attention.py
│ │ │ ├── radix_linear_attention.py
│ │ │ ├── rocm_linear_utils.py
│ │ │ ├── rotary_embedding/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── factory.py
│ │ │ │ ├── mrope.py
│ │ │ │ ├── mrope_rope_index.py
│ │ │ │ ├── rope_variant.py
│ │ │ │ ├── triton_kernels.py
│ │ │ │ ├── utils.py
│ │ │ │ └── yarn.py
│ │ │ ├── sampler.py
│ │ │ ├── sparse_pooler.py
│ │ │ ├── torchao_utils.py
│ │ │ ├── utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── common.py
│ │ │ │ ├── hash.py
│ │ │ │ ├── logprob.py
│ │ │ │ └── multi_platform.py
│ │ │ └── vocab_parallel_embedding.py
│ │ ├── lora/
│ │ │ ├── backend/
│ │ │ │ ├── ascend_backend.py
│ │ │ │ ├── base_backend.py
│ │ │ │ ├── chunked_backend.py
│ │ │ │ ├── lmhead_mixing.py
│ │ │ │ ├── lora_registry.py
│ │ │ │ ├── torch_backend.py
│ │ │ │ └── triton_backend.py
│ │ │ ├── eviction_policy.py
│ │ │ ├── layers.py
│ │ │ ├── lora.py
│ │ │ ├── lora_config.py
│ │ │ ├── lora_manager.py
│ │ │ ├── lora_overlap_loader.py
│ │ │ ├── lora_registry.py
│ │ │ ├── mem_pool.py
│ │ │ ├── torch_ops/
│ │ │ │ ├── __init__.py
│ │ │ │ └── lora_ops.py
│ │ │ ├── triton_ops/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── chunked_embedding_lora_a.py
│ │ │ │ ├── chunked_sgmv_expand.py
│ │ │ │ ├── chunked_sgmv_shrink.py
│ │ │ │ ├── embedding_lora_a.py
│ │ │ │ ├── fused_moe_lora_kernel.py
│ │ │ │ ├── gate_up_lora_b.py
│ │ │ │ ├── qkv_lora_b.py
│ │ │ │ ├── sgemm_lora_a.py
│ │ │ │ └── sgemm_lora_b.py
│ │ │ └── utils.py
│ │ ├── managers/
│ │ │ ├── async_dynamic_batch_tokenizer.py
│ │ │ ├── async_mm_data_processor.py
│ │ │ ├── cache_controller.py
│ │ │ ├── configure_logging.py
│ │ │ ├── data_parallel_controller.py
│ │ │ ├── detokenizer_manager.py
│ │ │ ├── disagg_service.py
│ │ │ ├── io_struct.py
│ │ │ ├── mm_utils.py
│ │ │ ├── multi_tokenizer_mixin.py
│ │ │ ├── multimodal_processor.py
│ │ │ ├── overlap_utils.py
│ │ │ ├── prefill_delayer.py
│ │ │ ├── schedule_batch.py
│ │ │ ├── schedule_policy.py
│ │ │ ├── scheduler.py
│ │ │ ├── scheduler_dp_attn_mixin.py
│ │ │ ├── scheduler_input_blocker.py
│ │ │ ├── scheduler_output_processor_mixin.py
│ │ │ ├── scheduler_pp_mixin.py
│ │ │ ├── scheduler_profiler_mixin.py
│ │ │ ├── scheduler_recv_skipper.py
│ │ │ ├── scheduler_runtime_checker_mixin.py
│ │ │ ├── scheduler_update_weights_mixin.py
│ │ │ ├── session_controller.py
│ │ │ ├── template_manager.py
│ │ │ ├── tokenizer_communicator_mixin.py
│ │ │ ├── tokenizer_manager.py
│ │ │ ├── tokenizer_manager_multiitem_mixin.py
│ │ │ ├── tp_worker.py
│ │ │ └── utils.py
│ │ ├── mem_cache/
│ │ │ ├── allocator.py
│ │ │ ├── base_prefix_cache.py
│ │ │ ├── cache_init_params.py
│ │ │ ├── chunk_cache.py
│ │ │ ├── common.py
│ │ │ ├── cpp_radix_tree/
│ │ │ │ ├── common.h
│ │ │ │ ├── radix_tree.py
│ │ │ │ ├── tree_v2.cpp
│ │ │ │ ├── tree_v2.h
│ │ │ │ ├── tree_v2_binding.cpp
│ │ │ │ ├── tree_v2_debug.cpp
│ │ │ │ ├── tree_v2_impl.h
│ │ │ │ └── tree_v2_node.h
│ │ │ ├── evict_policy.py
│ │ │ ├── flush_cache.py
│ │ │ ├── hi_mamba_radix_cache.py
│ │ │ ├── hicache_storage.py
│ │ │ ├── hiradix_cache.py
│ │ │ ├── mamba_radix_cache.py
│ │ │ ├── memory_pool.py
│ │ │ ├── memory_pool_host.py
│ │ │ ├── multimodal_cache.py
│ │ │ ├── radix_cache.py
│ │ │ ├── radix_cache_cpp.py
│ │ │ ├── session_aware_cache.py
│ │ │ ├── sparsity/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── algorithms/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base_algorithm.py
│ │ │ │ │ ├── deepseek_nsa.py
│ │ │ │ │ └── quest_algorithm.py
│ │ │ │ ├── backend/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── backend_adaptor.py
│ │ │ │ ├── core/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── sparse_coordinator.py
│ │ │ │ └── factory.py
│ │ │ ├── storage/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── aibrix_kvcache/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── aibrix_kvcache_storage.py
│ │ │ │ │ └── unit_test.py
│ │ │ │ ├── backend_factory.py
│ │ │ │ ├── eic/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── eic_storage.py
│ │ │ │ │ └── test_unit.py
│ │ │ │ ├── hf3fs/
│ │ │ │ │ ├── docs/
│ │ │ │ │ │ ├── README.md
│ │ │ │ │ │ ├── deploy_sglang_3fs_multinode.md
│ │ │ │ │ │ └── setup_usrbio_client.md
│ │ │ │ │ ├── hf3fs_client.py
│ │ │ │ │ ├── hf3fs_usrbio_client.py
│ │ │ │ │ ├── hf3fs_utils.cpp
│ │ │ │ │ ├── mini_3fs_metadata_server.py
│ │ │ │ │ ├── storage_hf3fs.py
│ │ │ │ │ └── test_hf3fs_utils.py
│ │ │ │ ├── lmcache/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── example_config.yaml
│ │ │ │ │ ├── lmc_radix_cache.py
│ │ │ │ │ └── unit_test.py
│ │ │ │ ├── mooncake_store/
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── embedding_cache_controller.py
│ │ │ │ │ ├── mooncake_embedding_store.py
│ │ │ │ │ ├── mooncake_store.py
│ │ │ │ │ └── test_mooncake_store.py
│ │ │ │ └── nixl/
│ │ │ │ ├── README.md
│ │ │ │ ├── hicache_nixl.py
│ │ │ │ ├── nixl.config.toml.sample
│ │ │ │ ├── nixl_utils.py
│ │ │ │ └── test_hicache_nixl_storage.py
│ │ │ ├── swa_memory_pool.py
│ │ │ ├── swa_radix_cache.py
│ │ │ └── utils.py
│ │ ├── model_executor/
│ │ │ ├── cpu_graph_runner.py
│ │ │ ├── cuda_graph_runner.py
│ │ │ ├── forward_batch_deepseek_mha_mixin.py
│ │ │ ├── forward_batch_info.py
│ │ │ ├── hook_manager.py
│ │ │ ├── input_buffers.py
│ │ │ ├── mindspore_runner.py
│ │ │ ├── model_runner.py
│ │ │ ├── model_runner_kv_cache_mixin.py
│ │ │ └── piecewise_cuda_graph_runner.py
│ │ ├── model_loader/
│ │ │ ├── __init__.py
│ │ │ ├── ci_weight_validation.py
│ │ │ ├── loader.py
│ │ │ ├── remote_instance_weight_loader_utils.py
│ │ │ ├── utils.py
│ │ │ └── weight_utils.py
│ │ ├── models/
│ │ │ ├── afmoe.py
│ │ │ ├── apertus.py
│ │ │ ├── arcee.py
│ │ │ ├── baichuan.py
│ │ │ ├── bailing_moe.py
│ │ │ ├── bailing_moe_linear.py
│ │ │ ├── bailing_moe_nextn.py
│ │ │ ├── bert.py
│ │ │ ├── chatglm.py
│ │ │ ├── clip.py
│ │ │ ├── commandr.py
│ │ │ ├── dbrx.py
│ │ │ ├── deepseek.py
│ │ │ ├── deepseek_common/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention_backend_handler.py
│ │ │ │ ├── attention_forward_methods/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── forward_methods.py
│ │ │ │ │ ├── forward_mha.py
│ │ │ │ │ ├── forward_mla.py
│ │ │ │ │ ├── forward_mla_fused_rope_cpu.py
│ │ │ │ │ └── forward_mla_fused_rope_rocm.py
│ │ │ │ ├── deepseek_weight_loader.py
│ │ │ │ └── utils.py
│ │ │ ├── deepseek_janus_pro.py
│ │ │ ├── deepseek_nextn.py
│ │ │ ├── deepseek_ocr.py
│ │ │ ├── deepseek_v2.py
│ │ │ ├── deepseek_vl2.py
│ │ │ ├── dots_ocr.py
│ │ │ ├── dots_vlm.py
│ │ │ ├── dots_vlm_vit.py
│ │ │ ├── ernie4.py
│ │ │ ├── ernie45_moe_vl.py
│ │ │ ├── ernie45_vl.py
│ │ │ ├── ernie4_eagle.py
│ │ │ ├── exaone.py
│ │ │ ├── exaone4.py
│ │ │ ├── exaone_moe.py
│ │ │ ├── exaone_moe_mtp.py
│ │ │ ├── falcon_h1.py
│ │ │ ├── gemma.py
│ │ │ ├── gemma2.py
│ │ │ ├── gemma2_reward.py
│ │ │ ├── gemma3_causal.py
│ │ │ ├── gemma3_mm.py
│ │ │ ├── gemma3n_audio.py
│ │ │ ├── gemma3n_causal.py
│ │ │ ├── gemma3n_mm.py
│ │ │ ├── glm4.py
│ │ │ ├── glm4_moe.py
│ │ │ ├── glm4_moe_lite.py
│ │ │ ├── glm4_moe_nextn.py
│ │ │ ├── glm4v.py
│ │ │ ├── glm4v_moe.py
│ │ │ ├── glm_ocr.py
│ │ │ ├── glm_ocr_nextn.py
│ │ │ ├── glmasr.py
│ │ │ ├── gpt2.py
│ │ │ ├── gpt_bigcode.py
│ │ │ ├── gpt_j.py
│ │ │ ├── gpt_oss.py
│ │ │ ├── granite.py
│ │ │ ├── granitemoe.py
│ │ │ ├── granitemoehybrid.py
│ │ │ ├── grok.py
│ │ │ ├── hunyuan.py
│ │ │ ├── idefics2.py
│ │ │ ├── internlm2.py
│ │ │ ├── internlm2_reward.py
│ │ │ ├── interns1.py
│ │ │ ├── interns1pro.py
│ │ │ ├── internvl.py
│ │ │ ├── iquest_loopcoder.py
│ │ │ ├── jet_nemotron.py
│ │ │ ├── jet_vlm.py
│ │ │ ├── kimi_k25.py
│ │ │ ├── kimi_linear.py
│ │ │ ├── kimi_vl.py
│ │ │ ├── kimi_vl_moonvit.py
│ │ │ ├── lfm2.py
│ │ │ ├── lfm2_moe.py
│ │ │ ├── lightonocr.py
│ │ │ ├── llada2.py
│ │ │ ├── llama.py
│ │ │ ├── llama4.py
│ │ │ ├── llama_classification.py
│ │ │ ├── llama_eagle.py
│ │ │ ├── llama_eagle3.py
│ │ │ ├── llama_embedding.py
│ │ │ ├── llama_reward.py
│ │ │ ├── llava.py
│ │ │ ├── llavavid.py
│ │ │ ├── longcat_flash.py
│ │ │ ├── longcat_flash_nextn.py
│ │ │ ├── midashenglm.py
│ │ │ ├── mimo.py
│ │ │ ├── mimo_mtp.py
│ │ │ ├── mimo_v2_flash.py
│ │ │ ├── mimo_v2_flash_nextn.py
│ │ │ ├── mindspore.py
│ │ │ ├── minicpm.py
│ │ │ ├── minicpm3.py
│ │ │ ├── minicpmo.py
│ │ │ ├── minicpmv.py
│ │ │ ├── minimax_m2.py
│ │ │ ├── ministral3.py
│ │ │ ├── mistral.py
│ │ │ ├── mistral_large_3.py
│ │ │ ├── mistral_large_3_eagle.py
│ │ │ ├── mixtral.py
│ │ │ ├── mixtral_quant.py
│ │ │ ├── mllama.py
│ │ │ ├── mllama4.py
│ │ │ ├── nano_nemotron_vl.py
│ │ │ ├── nemotron_h.py
│ │ │ ├── nemotron_h_mtp.py
│ │ │ ├── nemotron_nas.py
│ │ │ ├── nvila.py
│ │ │ ├── nvila_lite.py
│ │ │ ├── olmo.py
│ │ │ ├── olmo2.py
│ │ │ ├── olmoe.py
│ │ │ ├── opt.py
│ │ │ ├── orion.py
│ │ │ ├── paddleocr_vl.py
│ │ │ ├── persimmon.py
│ │ │ ├── phi.py
│ │ │ ├── phi3_small.py
│ │ │ ├── phi4mm.py
│ │ │ ├── phi4mm_audio.py
│ │ │ ├── phi4mm_utils.py
│ │ │ ├── phimoe.py
│ │ │ ├── pixtral.py
│ │ │ ├── points_v15_chat.py
│ │ │ ├── qwen.py
│ │ │ ├── qwen2.py
│ │ │ ├── qwen2_5_vl.py
│ │ │ ├── qwen2_audio.py
│ │ │ ├── qwen2_classification.py
│ │ │ ├── qwen2_eagle.py
│ │ │ ├── qwen2_moe.py
│ │ │ ├── qwen2_rm.py
│ │ │ ├── qwen2_vl.py
│ │ │ ├── qwen3.py
│ │ │ ├── qwen3_5.py
│ │ │ ├── qwen3_5_mtp.py
│ │ │ ├── qwen3_classification.py
│ │ │ ├── qwen3_moe.py
│ │ │ ├── qwen3_next.py
│ │ │ ├── qwen3_next_mtp.py
│ │ │ ├── qwen3_omni_moe.py
│ │ │ ├── qwen3_rm.py
│ │ │ ├── qwen3_vl.py
│ │ │ ├── qwen3_vl_moe.py
│ │ │ ├── radio.py
│ │ │ ├── registry.py
│ │ │ ├── roberta.py
│ │ │ ├── sarashina2_vision.py
│ │ │ ├── sarvam_moe.py
│ │ │ ├── sdar.py
│ │ │ ├── sdar_moe.py
│ │ │ ├── siglip.py
│ │ │ ├── solar.py
│ │ │ ├── stablelm.py
│ │ │ ├── starcoder2.py
│ │ │ ├── step3_vl.py
│ │ │ ├── step3_vl_10b.py
│ │ │ ├── step3p5.py
│ │ │ ├── step3p5_mtp.py
│ │ │ ├── teleflm.py
│ │ │ ├── torch_native_llama.py
│ │ │ ├── transformers.py
│ │ │ ├── utils.py
│ │ │ ├── whisper.py
│ │ │ ├── xverse.py
│ │ │ ├── xverse_moe.py
│ │ │ └── yivl.py
│ │ ├── multimodal/
│ │ │ ├── customized_mm_processor_utils.py
│ │ │ ├── evs/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── evs_core.py
│ │ │ │ ├── evs_module.py
│ │ │ │ └── evs_processor.py
│ │ │ ├── internvl_utils.py
│ │ │ ├── internvl_vit_cuda_graph_runner.py
│ │ │ ├── mm_utils.py
│ │ │ ├── processors/
│ │ │ │ ├── base_processor.py
│ │ │ │ ├── clip.py
│ │ │ │ ├── deepseek_ocr.py
│ │ │ │ ├── deepseek_vl_v2.py
│ │ │ │ ├── dots_vlm.py
│ │ │ │ ├── ernie45_vl.py
│ │ │ │ ├── gemma3.py
│ │ │ │ ├── gemma3n.py
│ │ │ │ ├── glm4v.py
│ │ │ │ ├── glmasr.py
│ │ │ │ ├── interns1pro.py
│ │ │ │ ├── internvl.py
│ │ │ │ ├── janus_pro.py
│ │ │ │ ├── kimi_k25.py
│ │ │ │ ├── kimi_vl.py
│ │ │ │ ├── lightonocr.py
│ │ │ │ ├── llava.py
│ │ │ │ ├── midashenglm.py
│ │ │ │ ├── minicpm.py
│ │ │ │ ├── mlama.py
│ │ │ │ ├── mllama4.py
│ │ │ │ ├── nano_nemotron_vl.py
│ │ │ │ ├── nvila.py
│ │ │ │ ├── paddleocr_vlm.py
│ │ │ │ ├── phi4mm.py
│ │ │ │ ├── pixtral.py
│ │ │ │ ├── points_v15_chat.py
│ │ │ │ ├── qwen_audio.py
│ │ │ │ ├── qwen_vl.py
│ │ │ │ ├── sarashina2_vision.py
│ │ │ │ ├── step3_vl.py
│ │ │ │ └── whisper.py
│ │ │ └── vit_cuda_graph_runner.py
│ │ ├── multiplex/
│ │ │ ├── multiplexing_mixin.py
│ │ │ └── pdmux_context.py
│ │ ├── observability/
│ │ │ ├── cpu_monitor.py
│ │ │ ├── func_timer.py
│ │ │ ├── label_transform.py
│ │ │ ├── metrics_collector.py
│ │ │ ├── req_time_stats.py
│ │ │ ├── request_metrics_exporter.py
│ │ │ ├── scheduler_metrics_mixin.py
│ │ │ ├── startup_func_log_and_timer.py
│ │ │ ├── trace.py
│ │ │ └── utils.py
│ │ ├── parser/
│ │ │ ├── code_completion_parser.py
│ │ │ ├── conversation.py
│ │ │ ├── harmony_parser.py
│ │ │ ├── jinja_template_utils.py
│ │ │ └── reasoning_parser.py
│ │ ├── ray/
│ │ │ ├── __init__.py
│ │ │ ├── engine.py
│ │ │ ├── http_server.py
│ │ │ └── scheduler_actor.py
│ │ ├── sampling/
│ │ │ ├── custom_logit_processor.py
│ │ │ ├── penaltylib/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── frequency_penalty.py
│ │ │ │ ├── min_new_tokens.py
│ │ │ │ ├── orchestrator.py
│ │ │ │ └── presence_penalty.py
│ │ │ ├── sampling_batch_info.py
│ │ │ └── sampling_params.py
│ │ ├── server_args.py
│ │ ├── server_args_config_parser.py
│ │ ├── speculative/
│ │ │ ├── base_spec_worker.py
│ │ │ ├── cpp_ngram/
│ │ │ │ ├── .clang-format
│ │ │ │ ├── ngram.cpp
│ │ │ │ ├── ngram.h
│ │ │ │ ├── ngram_cache.py
│ │ │ │ ├── ngram_cache_binding.cpp
│ │ │ │ ├── param.h
│ │ │ │ └── queue.h
│ │ │ ├── draft_utils.py
│ │ │ ├── eagle_draft_cuda_graph_runner.py
│ │ │ ├── eagle_draft_extend_cuda_graph_runner.py
│ │ │ ├── eagle_info.py
│ │ │ ├── eagle_info_v2.py
│ │ │ ├── eagle_utils.py
│ │ │ ├── eagle_worker.py
│ │ │ ├── eagle_worker_v2.py
│ │ │ ├── multi_layer_eagle_draft_extend_cuda_graph_runner.py
│ │ │ ├── multi_layer_eagle_utils.py
│ │ │ ├── multi_layer_eagle_worker.py
│ │ │ ├── multi_layer_eagle_worker_v2.py
│ │ │ ├── ngram_info.py
│ │ │ ├── ngram_worker.py
│ │ │ ├── spec_info.py
│ │ │ ├── spec_utils.py
│ │ │ ├── standalone_worker.py
│ │ │ └── standalone_worker_v2.py
│ │ ├── tokenizer/
│ │ │ └── tiktoken_tokenizer.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ ├── aio_rwlock.py
│ │ │ ├── auth.py
│ │ │ ├── bench_utils.py
│ │ │ ├── common.py
│ │ │ ├── cuda_ipc_transport_utils.py
│ │ │ ├── custom_op.py
│ │ │ ├── device_timer.py
│ │ │ ├── gauge_histogram.py
│ │ │ ├── hf_transformers_utils.py
│ │ │ ├── host_shared_memory.py
│ │ │ ├── json_response.py
│ │ │ ├── log_utils.py
│ │ │ ├── mistral_utils.py
│ │ │ ├── model_file_verifier.py
│ │ │ ├── multi_stream_utils.py
│ │ │ ├── network.py
│ │ │ ├── numa_utils.py
│ │ │ ├── nvtx_pytorch_hooks.py
│ │ │ ├── offloader.py
│ │ │ ├── patch_tokenizer.py
│ │ │ ├── patch_torch.py
│ │ │ ├── poll_based_barrier.py
│ │ │ ├── profile_merger.py
│ │ │ ├── profile_utils.py
│ │ │ ├── request_logger.py
│ │ │ ├── rpd_utils.py
│ │ │ ├── scheduler_status_logger.py
│ │ │ ├── slow_rank_detector.py
│ │ │ ├── torch_memory_saver_adapter.py
│ │ │ ├── video_decoder.py
│ │ │ ├── watchdog.py
│ │ │ └── weight_checker.py
│ │ └── weight_sync/
│ │ ├── tensor_bucket.py
│ │ └── utils.py
│ ├── test/
│ │ ├── __init__.py
│ │ ├── accuracy_test_runner.py
│ │ ├── ascend/
│ │ │ ├── __init__.py
│ │ │ ├── disaggregation_utils.py
│ │ │ ├── gsm8k_ascend_mixin.py
│ │ │ ├── test_ascend_utils.py
│ │ │ └── vlm_utils.py
│ │ ├── attention/
│ │ │ ├── __init__.py
│ │ │ ├── test_flashattn_backend.py
│ │ │ ├── test_flashattn_mla_backend.py
│ │ │ ├── test_prefix_chunk_info.py
│ │ │ └── test_trtllm_mla_backend.py
│ │ ├── bench_one_batch_server_internal.py
│ │ ├── ci/
│ │ │ ├── __init__.py
│ │ │ ├── ci_register.py
│ │ │ ├── ci_stress_utils.py
│ │ │ ├── ci_utils.py
│ │ │ └── run_with_retry.py
│ │ ├── doc_patch.py
│ │ ├── external_models/
│ │ │ └── custom_qwen2_vl.py
│ │ ├── few_shot_gsm8k.py
│ │ ├── few_shot_gsm8k_engine.py
│ │ ├── get_logits_ut.py
│ │ ├── gpt_oss_common.py
│ │ ├── kits/
│ │ │ ├── abort_timeout_kit.py
│ │ │ ├── cache_hit_kit.py
│ │ │ ├── ebnf_constrained_kit.py
│ │ │ ├── gsm8k_accuracy_kit.py
│ │ │ ├── json_constrained_kit.py
│ │ │ ├── kl_divergence_kit.py
│ │ │ ├── lm_eval_kit.py
│ │ │ ├── matched_stop_kit.py
│ │ │ ├── mmmu_vlm_kit.py
│ │ │ ├── prefix_cache_branching_kit.py
│ │ │ ├── radix_cache_server_kit.py
│ │ │ ├── regex_constrained_kit.py
│ │ │ └── spec_decoding_kit.py
│ │ ├── kl_test_utils.py
│ │ ├── long_prompt.txt
│ │ ├── longbench_v2/
│ │ │ ├── __init__.py
│ │ │ ├── longbench_v2_evaluation.md
│ │ │ ├── test_longbench_v2_eval.py
│ │ │ ├── validate_longbench_v2.py
│ │ │ └── validate_longbench_v2_standalone.py
│ │ ├── lora_utils.py
│ │ ├── nightly_bench_utils.py
│ │ ├── nightly_utils.py
│ │ ├── performance_test_runner.py
│ │ ├── run_combined_tests.py
│ │ ├── run_eval.py
│ │ ├── runners.py
│ │ ├── send_one.py
│ │ ├── server_fixtures/
│ │ │ ├── default_fixture.py
│ │ │ ├── disaggregation_fixture.py
│ │ │ ├── eagle_fixture.py
│ │ │ └── mmmu_fixture.py
│ │ ├── simple_eval_aime25.py
│ │ ├── simple_eval_common.py
│ │ ├── simple_eval_gpqa.py
│ │ ├── simple_eval_gsm8k.py
│ │ ├── simple_eval_humaneval.py
│ │ ├── simple_eval_longbench_v2.py
│ │ ├── simple_eval_math.py
│ │ ├── simple_eval_mgsm.py
│ │ ├── simple_eval_mmlu.py
│ │ ├── simple_eval_mmmu_vlm.py
│ │ ├── speculative/
│ │ │ └── test_spec_utils.py
│ │ ├── test_activation.py
│ │ ├── test_block_fp8.py
│ │ ├── test_block_fp8_deep_gemm_blackwell.py
│ │ ├── test_custom_ops.py
│ │ ├── test_cutlass_moe.py
│ │ ├── test_cutlass_w16a16_moe.py
│ │ ├── test_cutlass_w4a8_moe.py
│ │ ├── test_deepep_utils.py
│ │ ├── test_deterministic.py
│ │ ├── test_deterministic_utils.py
│ │ ├── test_dump_metric.py
│ │ ├── test_dynamic_grad_mode.py
│ │ ├── test_flashinfer_dispatcher.py
│ │ ├── test_http_server_auth.py
│ │ ├── test_kvfp4_quant_dequant.py
│ │ ├── test_layernorm.py
│ │ ├── test_marlin_utils.py
│ │ ├── test_programs.py
│ │ ├── test_utils.py
│ │ ├── tool_call_test_runner.py
│ │ └── vlm_utils.py
│ ├── utils.py
│ └── version.py
├── scripts/
│ ├── check_vram_clear.sh
│ ├── ci/
│ │ ├── amd/
│ │ │ ├── amd_ci_exec.sh
│ │ │ ├── amd_ci_install_dependency.sh
│ │ │ ├── amd_ci_start_container.sh
│ │ │ ├── amd_ci_start_container_disagg.sh
│ │ │ ├── amd_ci_warmup_aiter.py
│ │ │ └── test_rccl_multi_gpu.py
│ │ ├── cuda/
│ │ │ ├── ci_download_flashinfer_cubin.sh
│ │ │ ├── ci_install_deepep.sh
│ │ │ ├── ci_install_dependency.sh
│ │ │ ├── ci_install_gateway_dependencies.sh
│ │ │ ├── ci_start_disaggregation_servers.sh
│ │ │ ├── prepare_runner.sh
│ │ │ ├── warmup_deep_gemm.py
│ │ │ └── warmup_server.py
│ │ ├── musa/
│ │ │ ├── musa_install_dependency.sh
│ │ │ └── rename_wheels_musa.sh
│ │ ├── npu/
│ │ │ ├── npu_ci_install_dependency.sh
│ │ │ └── npu_log_print.sh
│ │ └── utils/
│ │ ├── ci_coverage_report.py
│ │ ├── cleanup_hf_cache.py
│ │ ├── merge_metrics.py
│ │ ├── prevalidate_cached_models.py
│ │ ├── publish_diffusion_gt.py
│ │ ├── publish_traces.py
│ │ ├── query_job_status.py
│ │ ├── runner_utilization_report.py
│ │ ├── save_diffusion_metrics.py
│ │ ├── save_metrics.py
│ │ └── slash_command_handler.py
│ ├── ci_monitor/
│ │ ├── README.md
│ │ ├── ci_failures_analysis.py
│ │ └── post_ci_failures_to_slack.py
│ ├── code_sync/
│ │ ├── check_commits.py
│ │ ├── copy_from_oss.py
│ │ ├── copy_to_oss.py
│ │ ├── guideline.md
│ │ ├── install_github_cli.sh
│ │ └── utils.py
│ ├── convert_otel_2_perfetto.py
│ ├── ensure_vram_clear.sh
│ ├── export_deepseek_nextn.py
│ ├── killall_sglang.sh
│ ├── playground/
│ │ ├── bench_speculative.py
│ │ ├── disaggregation/
│ │ │ ├── cli-logprob.py
│ │ │ ├── cli-so.py
│ │ │ └── cli.py
│ │ ├── frontend_reasoning.ipynb
│ │ ├── load_tokenizer.py
│ │ ├── long_context_example.py
│ │ ├── lora/
│ │ │ ├── analyzer.py
│ │ │ ├── lora_hf_play.py
│ │ │ └── lora_vllm_play.py
│ │ ├── reference_hf.py
│ │ ├── replay_request_dump.py
│ │ └── router/
│ │ ├── test_tree.py
│ │ └── tree.py
│ ├── release/
│ │ ├── README.md
│ │ ├── bump_flashinfer_version.py
│ │ ├── bump_kernel_version.py
│ │ ├── bump_kernel_version_to_sglang.py
│ │ ├── bump_sglang_version.py
│ │ ├── check_kernel_version_to_sglang.py
│ │ ├── commit_and_pr.sh
│ │ ├── commit_and_pr_kernel_to_sglang.sh
│ │ ├── test_utils.py
│ │ └── utils.py
│ ├── sort_testcases_alphabetically.py
│ ├── update_kernel_whl_index.py
│ ├── update_nightly_whl_index.py
│ ├── update_pr_whl_index.py
│ └── version_branch_to_tag.sh
├── sgl-kernel/
│ ├── .clang-format
│ ├── CMakeLists.txt
│ ├── Dockerfile
│ ├── LICENSE
│ ├── Makefile
│ ├── README.md
│ ├── THIRDPARTYNOTICES.txt
│ ├── analyze_whl_kernel_sizes.py
│ ├── benchmark/
│ │ ├── bench_activation.py
│ │ ├── bench_amd_deterministic_allreduce.py
│ │ ├── bench_awq_dequant.py
│ │ ├── bench_cutlass_mla.py
│ │ ├── bench_dsv3_fused_a_gemm.py
│ │ ├── bench_dsv3_router_gemm.py
│ │ ├── bench_es_fp8_blockwise_grouped_gemm.py
│ │ ├── bench_fp4_gemm.py
│ │ ├── bench_fp8_blockwise_gemm.py
│ │ ├── bench_fp8_blockwise_group_gemm.py
│ │ ├── bench_fp8_gemm.py
│ │ ├── bench_int8_gemm.py
│ │ ├── bench_kimi_k2_moe_fused_gate.py
│ │ ├── bench_moe_align_block_size.py
│ │ ├── bench_moe_ep_post_reorder.py
│ │ ├── bench_moe_fused_gate.py
│ │ ├── bench_moe_topk_sigmoid.py
│ │ ├── bench_moe_topk_softmax.py
│ │ ├── bench_mrope.py
│ │ ├── bench_per_tensor_quant_fp8.py
│ │ ├── bench_per_token_group_quant_8bit.py
│ │ ├── bench_per_token_quant_fp8.py
│ │ ├── bench_qserve_w4a8_gemm.py
│ │ ├── bench_rmsnorm.py
│ │ ├── bench_rotary_embedding.py
│ │ ├── bench_sum_scale.py
│ │ └── bench_top_k_top_p_sampling.py
│ ├── build.sh
│ ├── cmake/
│ │ ├── flashmla.cmake
│ │ └── utils.cmake
│ ├── csrc/
│ │ ├── allreduce/
│ │ │ ├── custom_all_reduce.cu
│ │ │ ├── custom_all_reduce.cuh
│ │ │ ├── custom_all_reduce.hip
│ │ │ ├── custom_all_reduce_hip.cuh
│ │ │ ├── deterministic_all_reduce.hip
│ │ │ ├── mscclpp_allreduce.cu
│ │ │ ├── mscclpp_allreduce.cuh
│ │ │ ├── quick_all_reduce.cu
│ │ │ ├── quick_all_reduce.cuh
│ │ │ ├── quick_all_reduce.h
│ │ │ ├── quick_all_reduce_base.h
│ │ │ └── test_mscclpp_allreduce.cu
│ │ ├── attention/
│ │ │ ├── cascade.cu
│ │ │ ├── cutlass_mla_kernel.cu
│ │ │ ├── cutlass_sm100_mla/
│ │ │ │ ├── device/
│ │ │ │ │ └── sm100_mla.hpp
│ │ │ │ └── kernel/
│ │ │ │ ├── sm100_fmha_mla_reduction.hpp
│ │ │ │ ├── sm100_fmha_mla_tma_warpspecialized.hpp
│ │ │ │ └── sm100_mla_tile_scheduler.hpp
│ │ │ ├── merge_attn_states.cu
│ │ │ └── vertical_slash_index.cu
│ │ ├── common_extension.cc
│ │ ├── common_extension_musa.cc
│ │ ├── common_extension_rocm.cc
│ │ ├── cpu/
│ │ │ ├── CMakeLists.txt
│ │ │ ├── aarch64/
│ │ │ │ └── shm.h
│ │ │ ├── activation.cpp
│ │ │ ├── bmm.cpp
│ │ │ ├── common.h
│ │ │ ├── conv3d.cpp
│ │ │ ├── decode.cpp
│ │ │ ├── extend.cpp
│ │ │ ├── flash_attn.cpp
│ │ │ ├── flash_attn.h
│ │ │ ├── gemm.cpp
│ │ │ ├── gemm.h
│ │ │ ├── gemm_fp8.cpp
│ │ │ ├── gemm_int4.cpp
│ │ │ ├── gemm_int8.cpp
│ │ │ ├── interface.cpp
│ │ │ ├── mamba/
│ │ │ │ ├── conv.cpp
│ │ │ │ └── fla.cpp
│ │ │ ├── model/
│ │ │ │ └── qwen3.cpp
│ │ │ ├── moe.cpp
│ │ │ ├── moe_fp8.cpp
│ │ │ ├── moe_int4.cpp
│ │ │ ├── moe_int8.cpp
│ │ │ ├── norm.cpp
│ │ │ ├── numa_utils.cpp
│ │ │ ├── preprocessor.cpp
│ │ │ ├── qkv_proj.cpp
│ │ │ ├── rope.cpp
│ │ │ ├── shm.cpp
│ │ │ ├── shm.h
│ │ │ ├── topk.cpp
│ │ │ ├── torch_extension_cpu.cpp
│ │ │ ├── vec.h
│ │ │ ├── vec_pack.h
│ │ │ └── x86_64/
│ │ │ └── shm.h
│ │ ├── cutlass_extensions/
│ │ │ ├── common.hpp
│ │ │ ├── detail/
│ │ │ │ └── collective/
│ │ │ │ └── mixed_input_utils.hpp
│ │ │ ├── epilogue/
│ │ │ │ └── epilogue_per_row_per_col_scale.h
│ │ │ └── gemm/
│ │ │ ├── collective/
│ │ │ │ ├── builders/
│ │ │ │ │ └── sm90_gmma_builder_mixed_input.inl
│ │ │ │ ├── collective_builder_mixed_input.hpp
│ │ │ │ ├── collective_mma_array_mixed_input.hpp
│ │ │ │ └── sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
│ │ │ ├── cutlass_gemm_caller.cuh
│ │ │ ├── dispatch_policy.hpp
│ │ │ ├── fp8_blockwise_gemm_sm90_dispatch.cuh
│ │ │ ├── gemm_universal_base_compat.h
│ │ │ └── gemm_with_epilogue_visitor.h
│ │ ├── elementwise/
│ │ │ ├── activation.cu
│ │ │ ├── cast.cu
│ │ │ ├── concat_mla.cu
│ │ │ ├── copy.cu
│ │ │ ├── fused_add_rms_norm_kernel.cu
│ │ │ ├── pos_enc.cu
│ │ │ ├── pos_enc.cuh
│ │ │ ├── topk.cu
│ │ │ └── utils.cuh
│ │ ├── expert_specialization/
│ │ │ ├── es_fp8_blockwise.cu
│ │ │ ├── es_fp8_blockwise_functor.cuh
│ │ │ ├── es_fp8_blockwise_launcher.cuh
│ │ │ ├── es_fp8_blockwise_traits.cuh
│ │ │ ├── es_sm100_mxfp8_blockscaled.cu
│ │ │ ├── es_sm100_mxfp8_blockscaled_functor.cuh
│ │ │ ├── es_sm100_mxfp8_blockscaled_group_quant.cu
│ │ │ ├── es_sm100_mxfp8_blockscaled_group_quant.cuh
│ │ │ ├── es_sm100_mxfp8_blockscaled_launcher.cuh
│ │ │ └── es_sm100_mxfp8_blockscaled_traits.cuh
│ │ ├── flash_extension.cc
│ │ ├── flashmla_extension.cc
│ │ ├── gemm/
│ │ │ ├── awq_kernel.cu
│ │ │ ├── bmm_fp8.cu
│ │ │ ├── dsv3_fused_a_gemm.cu
│ │ │ ├── dsv3_router_gemm_bf16_out.cu
│ │ │ ├── dsv3_router_gemm_entry.cu
│ │ │ ├── dsv3_router_gemm_float_out.cu
│ │ │ ├── fp8_blockwise_gemm_kernel.cu
│ │ │ ├── fp8_gemm_kernel.cu
│ │ │ ├── gptq/
│ │ │ │ ├── compat.cuh
│ │ │ │ ├── gptq_kernel.cu
│ │ │ │ ├── matrix_view.cuh
│ │ │ │ ├── qdq_2.cuh
│ │ │ │ ├── qdq_3.cuh
│ │ │ │ ├── qdq_4.cuh
│ │ │ │ ├── qdq_8.cuh
│ │ │ │ └── qdq_util.cuh
│ │ │ ├── int8_gemm_kernel.cu
│ │ │ ├── marlin/
│ │ │ │ ├── dequant.h
│ │ │ │ ├── kernel.h
│ │ │ │ ├── marlin.cuh
│ │ │ │ ├── marlin_dtypes.cuh
│ │ │ │ └── marlin_template.h
│ │ │ ├── math.hpp
│ │ │ ├── per_token_group_quant_8bit.cu
│ │ │ ├── per_token_group_quant_8bit_v2.cu
│ │ │ ├── per_token_quant_fp8.cu
│ │ │ ├── qserve_w4a8_per_chn_gemm.cu
│ │ │ └── qserve_w4a8_per_group_gemm.cu
│ │ ├── grammar/
│ │ │ └── apply_token_bitmask_inplace_cuda.cu
│ │ ├── kvcacheio/
│ │ │ └── transfer.cu
│ │ ├── mamba/
│ │ │ ├── causal_conv1d.cu
│ │ │ └── causal_conv1d.h
│ │ ├── memory/
│ │ │ └── weak_ref_tensor.cpp
│ │ ├── moe/
│ │ │ ├── cutlass_moe/
│ │ │ │ └── w4a8/
│ │ │ │ ├── scaled_mm_entry.cu
│ │ │ │ ├── w4a8_get_group_starts.cuh
│ │ │ │ ├── w4a8_grouped_mm_c3x.cu
│ │ │ │ ├── w4a8_grouped_mm_c3x.cuh
│ │ │ │ └── w4a8_moe_data.cu
│ │ │ ├── cutlass_moe_helper.cu
│ │ │ ├── fp8_blockwise_moe_kernel.cu
│ │ │ ├── fused_qknorm_rope_kernel.cu
│ │ │ ├── kimi_k2_moe_fused_gate.cu
│ │ │ ├── moe_align_kernel.cu
│ │ │ ├── moe_fused_gate.cu
│ │ │ ├── moe_sum.cu
│ │ │ ├── moe_sum_reduce.cu
│ │ │ ├── moe_topk_sigmoid_kernels.cu
│ │ │ ├── moe_topk_softmax_kernels.cu
│ │ │ └── prepare_moe_input.cu
│ │ ├── quantization/
│ │ │ └── gguf/
│ │ │ ├── dequantize.cuh
│ │ │ ├── ggml-common.h
│ │ │ ├── gguf_kernel.cu
│ │ │ ├── mmq.cuh
│ │ │ ├── mmvq.cuh
│ │ │ ├── moe.cuh
│ │ │ ├── moe_vec.cuh
│ │ │ └── vecdotq.cuh
│ │ ├── spatial/
│ │ │ ├── cuda_utils.h
│ │ │ ├── greenctx_stream.cu
│ │ │ └── greenctx_stream.h
│ │ ├── spatial_extension.cc
│ │ └── speculative/
│ │ ├── eagle_utils.cu
│ │ ├── ngram_utils.cu
│ │ ├── packbit.cu
│ │ ├── speculative_sampling.cu
│ │ └── speculative_sampling.cuh
│ ├── include/
│ │ ├── hip/
│ │ │ ├── hip_act_and_mul.cuh
│ │ │ ├── hip_math_def.h
│ │ │ ├── hip_vec_dtypes.h
│ │ │ └── impl/
│ │ │ ├── hip_vec_bf16_impl.h
│ │ │ ├── hip_vec_fp32_impl.h
│ │ │ └── hip_vec_half_impl.h
│ │ ├── pytorch_extension_utils_rocm.h
│ │ ├── scalar_type.hpp
│ │ ├── sgl_flash_kernel_ops.h
│ │ ├── sgl_kernel_ops.h
│ │ ├── sgl_kernel_torch_shim.h
│ │ └── utils.h
│ ├── kernel-runner-setup.sh
│ ├── pyproject.toml
│ ├── pyproject_cpu.toml
│ ├── pyproject_musa.toml
│ ├── pyproject_rocm.toml
│ ├── python/
│ │ └── sgl_kernel/
│ │ ├── __init__.py
│ │ ├── _fa4_interface.py
│ │ ├── allreduce.py
│ │ ├── attention.py
│ │ ├── cutlass_moe.py
│ │ ├── elementwise.py
│ │ ├── expert_specialization.py
│ │ ├── flash_attn.py
│ │ ├── flash_mla.py
│ │ ├── gemm.py
│ │ ├── grammar.py
│ │ ├── kvcacheio.py
│ │ ├── load_utils.py
│ │ ├── mamba.py
│ │ ├── memory.py
│ │ ├── moe.py
│ │ ├── quantization/
│ │ │ ├── __init__.py
│ │ │ └── gguf.py
│ │ ├── sampling.py
│ │ ├── scalar_type.py
│ │ ├── sparse_flash_attn.py
│ │ ├── spatial.py
│ │ ├── speculative.py
│ │ ├── test_utils.py
│ │ ├── testing/
│ │ │ ├── __init__.py
│ │ │ └── rotary_embedding.py
│ │ ├── top_k.py
│ │ ├── utils.py
│ │ └── version.py
│ ├── rename_wheels.sh
│ ├── setup_musa.py
│ ├── setup_rocm.py
│ └── tests/
│ ├── conftest.py
│ ├── spatial/
│ │ └── test_greenctx_stream.py
│ ├── speculative/
│ │ ├── test_eagle_utils.py
│ │ ├── test_ngram_utils.py
│ │ └── test_speculative_sampling.py
│ ├── test_activation.py
│ ├── test_amd_deterministic_custom_allreduce.py
│ ├── test_amd_nccl_allreduce_determinism.py
│ ├── test_apply_token_bitmask_inplace.py
│ ├── test_awq_dequant.py
│ ├── test_bmm_fp8.py
│ ├── test_causal_conv1d.py
│ ├── test_copy.py
│ ├── test_custom_allreduce.py
│ ├── test_cutlass_mla.py
│ ├── test_cutlass_w4a8_moe_mm.py
│ ├── test_dsv3_fused_a_gemm.py
│ ├── test_dsv3_router_gemm.py
│ ├── test_es_fp8_blockwise_moe.py
│ ├── test_es_mxfp8_blockscaled_moe.py
│ ├── test_flash_attention.py
│ ├── test_flash_attn_sparse.py
│ ├── test_flashmla.py
│ ├── test_fp8_blockwise_gemm.py
│ ├── test_fp8_blockwise_moe.py
│ ├── test_fp8_gemm.py
│ ├── test_fused_qk_norm_rope.py
│ ├── test_gguf.py
│ ├── test_gptq_kernel.py
│ ├── test_hadamard.py
│ ├── test_int8_gemm.py
│ ├── test_kimi_k2_moe_fused_gate.py
│ ├── test_kvcacheio.py
│ ├── test_merge_state.py
│ ├── test_merge_state_v2.py
│ ├── test_moe_align.py
│ ├── test_moe_fused_gate.py
│ ├── test_moe_topk_sigmoid.py
│ ├── test_moe_topk_softmax.py
│ ├── test_mscclpp.py
│ ├── test_norm.py
│ ├── test_per_token_group_quant_8bit.py
│ ├── test_per_token_quant_fp8.py
│ ├── test_qserve_w4a8_per_chn_gemm.py
│ ├── test_qserve_w4a8_per_group_gemm.py
│ ├── test_sampling.py
│ ├── test_topk.py
│ ├── test_torch_defaults_reset.py
│ └── utils.py
├── sgl-model-gateway/
│ ├── .cargo/
│ │ └── config.toml
│ ├── Cargo.toml
│ ├── Makefile
│ ├── README.md
│ ├── benches/
│ │ ├── consistent_hash_bench.rs
│ │ ├── manual_policy_benchmark.rs
│ │ ├── request_processing.rs
│ │ ├── router_registry_bench.rs
│ │ ├── tree_benchmark.rs
│ │ └── wasm_middleware_latency.rs
│ ├── bindings/
│ │ ├── golang/
│ │ │ ├── .gitignore
│ │ │ ├── Cargo.toml
│ │ │ ├── Makefile
│ │ │ ├── README.md
│ │ │ ├── client.go
│ │ │ ├── client_test.go
│ │ │ ├── examples/
│ │ │ │ ├── oai_server/
│ │ │ │ │ ├── Makefile
│ │ │ │ │ ├── README.md
│ │ │ │ │ ├── config/
│ │ │ │ │ │ └── config.go
│ │ │ │ │ ├── docs/
│ │ │ │ │ │ └── benchmark_result.md
│ │ │ │ │ ├── go.sum
│ │ │ │ │ ├── handlers/
│ │ │ │ │ │ ├── chat.go
│ │ │ │ │ │ ├── health.go
│ │ │ │ │ │ └── models.go
│ │ │ │ │ ├── logger/
│ │ │ │ │ │ └── logger.go
│ │ │ │ │ ├── main.go
│ │ │ │ │ ├── models/
│ │ │ │ │ │ └── chat.go
│ │ │ │ │ ├── run.sh
│ │ │ │ │ ├── scripts/
│ │ │ │ │ │ ├── analyze_tpot.sh
│ │ │ │ │ │ ├── pprof_analysis.sh
│ │ │ │ │ │ ├── pprof_quick.sh
│ │ │ │ │ │ ├── pprof_test.sh
│ │ │ │ │ │ └── profile_tpot.sh
│ │ │ │ │ ├── service/
│ │ │ │ │ │ └── sglang.go
│ │ │ │ │ └── utils/
│ │ │ │ │ └── utils.go
│ │ │ │ ├── simple/
│ │ │ │ │ ├── main.go
│ │ │ │ │ └── run.sh
│ │ │ │ └── streaming/
│ │ │ │ ├── main.go
│ │ │ │ └── run.sh
│ │ │ ├── go.sum
│ │ │ ├── integration_test.go
│ │ │ ├── internal/
│ │ │ │ ├── ffi/
│ │ │ │ │ ├── batch_postprocessor.go
│ │ │ │ │ ├── client.go
│ │ │ │ │ ├── grpc_converter.go
│ │ │ │ │ ├── postprocessor.go
│ │ │ │ │ └── preprocessor.go
│ │ │ │ ├── grpc/
│ │ │ │ │ └── client_grpc.go
│ │ │ │ └── proto/
│ │ │ │ ├── sglang_scheduler.pb.go
│ │ │ │ └── sglang_scheduler_grpc.pb.go
│ │ │ └── src/
│ │ │ ├── client.rs
│ │ │ ├── error.rs
│ │ │ ├── grpc_converter.rs
│ │ │ ├── lib.rs
│ │ │ ├── memory.rs
│ │ │ ├── postprocessor.rs
│ │ │ ├── preprocessor.rs
│ │ │ ├── stream.rs
│ │ │ ├── tokenizer.rs
│ │ │ ├── tool_parser.rs
│ │ │ └── utils.rs
│ │ └── python/
│ │ ├── .coveragerc
│ │ ├── Cargo.toml
│ │ ├── MANIFEST.in
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ ├── setup.py
│ │ ├── src/
│ │ │ ├── lib.rs
│ │ │ └── sglang_router/
│ │ │ ├── __init__.py
│ │ │ ├── __main__.py
│ │ │ ├── cli.py
│ │ │ ├── launch_router.py
│ │ │ ├── launch_server.py
│ │ │ ├── mini_lb.py
│ │ │ ├── router.py
│ │ │ ├── router_args.py
│ │ │ └── version.py
│ │ └── tests/
│ │ ├── conftest.py
│ │ ├── test_arg_parser.py
│ │ ├── test_router_config.py
│ │ ├── test_startup_sequence.py
│ │ └── test_validation.py
│ ├── build.rs
│ ├── e2e_test/
│ │ ├── __init__.py
│ │ ├── benchmarks/
│ │ │ ├── __init__.py
│ │ │ ├── conftest.py
│ │ │ ├── results.py
│ │ │ ├── summarize.py
│ │ │ ├── test_pd_perf.py
│ │ │ └── test_regular_perf.py
│ │ ├── chat_completions/
│ │ │ ├── __init__.py
│ │ │ ├── test_enable_thinking.py
│ │ │ ├── test_function_calling.py
│ │ │ ├── test_openai_server.py
│ │ │ ├── test_reasoning_content.py
│ │ │ └── test_validation.py
│ │ ├── conftest.py
│ │ ├── embeddings/
│ │ │ ├── __init__.py
│ │ │ ├── test_basic.py
│ │ │ └── test_correctness.py
│ │ ├── fixtures/
│ │ │ ├── __init__.py
│ │ │ ├── hooks.py
│ │ │ ├── markers.py
│ │ │ ├── pool.py
│ │ │ ├── ports.py
│ │ │ └── setup_backend.py
│ │ ├── infra/
│ │ │ ├── __init__.py
│ │ │ ├── constants.py
│ │ │ ├── gateway.py
│ │ │ ├── gpu_allocator.py
│ │ │ ├── gpu_monitor.py
│ │ │ ├── model_pool.py
│ │ │ ├── model_specs.py
│ │ │ ├── process_utils.py
│ │ │ ├── run_eval.py
│ │ │ ├── simple_eval_common.py
│ │ │ └── simple_eval_mmlu.py
│ │ ├── pyproject.toml
│ │ ├── responses/
│ │ │ ├── __init__.py
│ │ │ ├── test_basic_crud.py
│ │ │ ├── test_state_management.py
│ │ │ ├── test_streaming_events.py
│ │ │ ├── test_structured_output.py
│ │ │ └── test_tools_call.py
│ │ └── router/
│ │ ├── __init__.py
│ │ ├── test_mmlu.py
│ │ ├── test_pd_mmlu.py
│ │ └── test_worker_api.py
│ ├── examples/
│ │ └── wasm/
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── wasm-guest-auth/
│ │ │ ├── Cargo.toml
│ │ │ ├── README.md
│ │ │ ├── build.sh
│ │ │ └── src/
│ │ │ └── lib.rs
│ │ ├── wasm-guest-logging/
│ │ │ ├── Cargo.toml
│ │ │ ├── README.md
│ │ │ ├── build.sh
│ │ │ └── src/
│ │ │ └── lib.rs
│ │ └── wasm-guest-ratelimit/
│ │ ├── Cargo.toml
│ │ ├── README.md
│ │ ├── build.sh
│ │ └── src/
│ │ └── lib.rs
│ ├── pytest.ini
│ ├── rustfmt.toml
│ ├── scripts/
│ │ ├── generate_gateway_release_notes.sh
│ │ ├── generate_vision_golden.py
│ │ ├── run_benchmarks.py
│ │ └── setup-sccache.sh
│ ├── src/
│ │ ├── app_context.rs
│ │ ├── config/
│ │ │ ├── builder.rs
│ │ │ ├── mod.rs
│ │ │ ├── types.rs
│ │ │ └── validation.rs
│ │ ├── core/
│ │ │ ├── circuit_breaker.rs
│ │ │ ├── error.rs
│ │ │ ├── job_queue.rs
│ │ │ ├── metrics_aggregator.rs
│ │ │ ├── mod.rs
│ │ │ ├── model_card.rs
│ │ │ ├── model_type.rs
│ │ │ ├── retry.rs
│ │ │ ├── steps/
│ │ │ │ ├── mcp_registration.rs
│ │ │ │ ├── mod.rs
│ │ │ │ ├── tokenizer_registration.rs
│ │ │ │ ├── wasm_module_registration.rs
│ │ │ │ ├── wasm_module_removal.rs
│ │ │ │ ├── worker/
│ │ │ │ │ ├── external/
│ │ │ │ │ │ ├── create_workers.rs
│ │ │ │ │ │ ├── discover_models.rs
│ │ │ │ │ │ └── mod.rs
│ │ │ │ │ ├── local/
│ │ │ │ │ │ ├── create_worker.rs
│ │ │ │ │ │ ├── detect_connection.rs
│ │ │ │ │ │ ├── discover_dp.rs
│ │ │ │ │ │ ├── discover_metadata.rs
│ │ │ │ │ │ ├── find_worker_to_update.rs
│ │ │ │ │ │ ├── find_workers_to_remove.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── remove_from_policy_registry.rs
│ │ │ │ │ │ ├── remove_from_worker_registry.rs
│ │ │ │ │ │ ├── submit_tokenizer_job.rs
│ │ │ │ │ │ ├── update_policies_for_worker.rs
│ │ │ │ │ │ ├── update_remaining_policies.rs
│ │ │ │ │ │ └── update_worker_properties.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ └── shared/
│ │ │ │ │ ├── activate.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── register.rs
│ │ │ │ │ └── update_policies.rs
│ │ │ │ ├── workflow_data.rs
│ │ │ │ └── workflow_engines.rs
│ │ │ ├── token_bucket.rs
│ │ │ ├── worker.rs
│ │ │ ├── worker_builder.rs
│ │ │ ├── worker_manager.rs
│ │ │ ├── worker_registry.rs
│ │ │ └── worker_service.rs
│ │ ├── lib.rs
│ │ ├── main.rs
│ │ ├── middleware.rs
│ │ ├── observability/
│ │ │ ├── events.rs
│ │ │ ├── gauge_histogram.rs
│ │ │ ├── inflight_tracker.rs
│ │ │ ├── logging.rs
│ │ │ ├── metrics.rs
│ │ │ ├── mod.rs
│ │ │ └── otel_trace.rs
│ │ ├── policies/
│ │ │ ├── bucket.rs
│ │ │ ├── cache_aware.rs
│ │ │ ├── consistent_hashing.rs
│ │ │ ├── factory.rs
│ │ │ ├── manual.rs
│ │ │ ├── mod.rs
│ │ │ ├── power_of_two.rs
│ │ │ ├── prefix_hash.rs
│ │ │ ├── random.rs
│ │ │ ├── registry.rs
│ │ │ ├── round_robin.rs
│ │ │ ├── tree.rs
│ │ │ └── utils.rs
│ │ ├── routers/
│ │ │ ├── conversations/
│ │ │ │ ├── handlers.rs
│ │ │ │ └── mod.rs
│ │ │ ├── error.rs
│ │ │ ├── factory.rs
│ │ │ ├── grpc/
│ │ │ │ ├── client.rs
│ │ │ │ ├── common/
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── response_collection.rs
│ │ │ │ │ ├── response_formatting.rs
│ │ │ │ │ ├── responses/
│ │ │ │ │ │ ├── context.rs
│ │ │ │ │ │ ├── handlers.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── streaming.rs
│ │ │ │ │ │ └── utils.rs
│ │ │ │ │ └── stages/
│ │ │ │ │ ├── client_acquisition.rs
│ │ │ │ │ ├── dispatch_metadata.rs
│ │ │ │ │ ├── helpers.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── request_execution.rs
│ │ │ │ │ └── worker_selection.rs
│ │ │ │ ├── context.rs
│ │ │ │ ├── harmony/
│ │ │ │ │ ├── builder.rs
│ │ │ │ │ ├── detector.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── parser.rs
│ │ │ │ │ ├── processor.rs
│ │ │ │ │ ├── responses/
│ │ │ │ │ │ ├── common.rs
│ │ │ │ │ │ ├── execution.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── non_streaming.rs
│ │ │ │ │ │ └── streaming.rs
│ │ │ │ │ ├── stages/
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ ├── streaming.rs
│ │ │ │ │ └── types.rs
│ │ │ │ ├── mod.rs
│ │ │ │ ├── pd_router.rs
│ │ │ │ ├── pipeline.rs
│ │ │ │ ├── proto_wrapper.rs
│ │ │ │ ├── regular/
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── processor.rs
│ │ │ │ │ ├── responses/
│ │ │ │ │ │ ├── common.rs
│ │ │ │ │ │ ├── conversions.rs
│ │ │ │ │ │ ├── handlers.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── non_streaming.rs
│ │ │ │ │ │ └── streaming.rs
│ │ │ │ │ ├── stages/
│ │ │ │ │ │ ├── chat/
│ │ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ │ ├── classify/
│ │ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ │ ├── embedding/
│ │ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ │ ├── generate/
│ │ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ │ ├── mod.rs
│ │ │ │ │ │ ├── preparation.rs
│ │ │ │ │ │ ├── request_building.rs
│ │ │ │ │ │ └── response_processing.rs
│ │ │ │ │ └── streaming.rs
│ │ │ │ ├── router.rs
│ │ │ │ └── utils.rs
│ │ │ ├── header_utils.rs
│ │ │ ├── http/
│ │ │ │ ├── mod.rs
│ │ │ │ ├── pd_router.rs
│ │ │ │ ├── pd_types.rs
│ │ │ │ └── router.rs
│ │ │ ├── mcp_utils.rs
│ │ │ ├── mesh/
│ │ │ │ ├── handlers.rs
│ │ │ │ └── mod.rs
│ │ │ ├── mod.rs
│ │ │ ├── openai/
│ │ │ │ ├── context.rs
│ │ │ │ ├── mod.rs
│ │ │ │ ├── provider.rs
│ │ │ │ ├── responses/
│ │ │ │ │ ├── accumulator.rs
│ │ │ │ │ ├── common.rs
│ │ │ │ │ ├── mcp.rs
│ │ │ │ │ ├── mod.rs
│ │ │ │ │ ├── non_streaming.rs
│ │ │ │ │ ├── streaming.rs
│ │ │ │ │ ├── tool_handler.rs
│ │ │ │ │ └── utils.rs
│ │ │ │ └── router.rs
│ │ │ ├── parse/
│ │ │ │ ├── handlers.rs
│ │ │ │ └── mod.rs
│ │ │ ├── persistence_utils.rs
│ │ │ ├── router_manager.rs
│ │ │ └── tokenize/
│ │ │ ├── handlers.rs
│ │ │ └── mod.rs
│ │ ├── server.rs
│ │ ├── service_discovery.rs
│ │ ├── version.rs
│ │ └── wasm/
│ │ ├── mod.rs
│ │ └── route.rs
│ └── tests/
│ ├── api/
│ │ ├── api_endpoints_test.rs
│ │ ├── mod.rs
│ │ ├── parser_endpoints_test.rs
│ │ ├── request_formats_test.rs
│ │ ├── responses_api_test.rs
│ │ └── streaming_tests.rs
│ ├── api_tests.rs
│ ├── common/
│ │ ├── mock_mcp_server.rs
│ │ ├── mock_openai_server.rs
│ │ ├── mock_worker.rs
│ │ ├── mod.rs
│ │ ├── redis_test_server.rs
│ │ ├── streaming_helpers.rs
│ │ ├── test_app.rs
│ │ ├── test_certs.rs
│ │ ├── test_config.rs
│ │ └── tls_mock_worker.rs
│ ├── inflight_tracker_test.rs
│ ├── load_guard_raii_test.rs
│ ├── mcp_test.rs
│ ├── metrics_aggregator_test.rs
│ ├── otel_tracing_test.rs
│ ├── reliability/
│ │ ├── circuit_breaker_test.rs
│ │ ├── fault_tolerance_test.rs
│ │ ├── mod.rs
│ │ ├── rate_limiting_test.rs
│ │ └── retries_test.rs
│ ├── reliability_tests.rs
│ ├── routing/
│ │ ├── cache_aware_backward_compat_test.rs
│ │ ├── header_forwarding_test.rs
│ │ ├── load_balancing_test.rs
│ │ ├── manual_routing_test.rs
│ │ ├── mod.rs
│ │ ├── payload_size_test.rs
│ │ ├── pd_routing_test.rs
│ │ ├── policy_registry_integration.rs
│ │ ├── power_of_two_test.rs
│ │ ├── service_discovery_test.rs
│ │ ├── test_openai_routing.rs
│ │ ├── test_pd_routing.rs
│ │ └── worker_management_test.rs
│ ├── routing_tests.rs
│ ├── security/
│ │ ├── auth_integration_test.rs
│ │ ├── auth_test.rs
│ │ ├── mod.rs
│ │ └── mtls_test.rs
│ ├── security_tests.rs
│ ├── spec/
│ │ ├── chat_completion.rs
│ │ ├── chat_message.rs
│ │ ├── embedding.rs
│ │ ├── mod.rs
│ │ ├── rerank.rs
│ │ └── responses.rs
│ ├── spec_test.rs
│ └── wasm_test.rs
└── test/
├── README.md
├── lm_eval_configs/
│ ├── NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.yaml
│ ├── NVIDIA-Nemotron-3-Nano-30B-A3B-FP8.yaml
│ └── Qwen3.5-397B-A17B.yaml
├── manual/
│ ├── ascend/
│ │ ├── test_ascend_deepseek_mtp.py
│ │ ├── test_ascend_w8a8_quantization.py
│ │ └── test_mindspore_models.py
│ ├── cpu/
│ │ └── test_comm.py
│ ├── debug_utils/
│ │ └── test_log_parser.py
│ ├── entrypoints/
│ │ └── http_server/
│ │ └── test_abort_request.py
│ ├── ep/
│ │ ├── test_deepep_internode.py
│ │ ├── test_deepep_intranode.py
│ │ ├── test_deepep_low_latency.py
│ │ ├── test_eplb.py
│ │ ├── test_moe_deepep.py
│ │ ├── test_moe_deepep_eval_accuracy_large.py
│ │ ├── test_mooncake_expert_backup.py
│ │ └── test_nixl_ep.py
│ ├── hicache/
│ │ ├── test_disaggregation_hicache.py
│ │ └── test_pp_with_hicache.py
│ ├── kv_transfer/
│ │ └── test_mooncake_transfer_engine.py
│ ├── lang_frontend/
│ │ ├── test_bind_cache.py
│ │ ├── test_choices.py
│ │ ├── test_jump_forward.py
│ │ ├── test_openai_backend.py
│ │ ├── test_separate_reasoning.py
│ │ └── test_separate_reasoning_execution.py
│ ├── layers/
│ │ ├── attention/
│ │ │ └── nsa/
│ │ │ ├── test_act_quant_triton.py
│ │ │ ├── test_get_k_scale_triton_kernel.py
│ │ │ └── test_index_buf_accessor.py
│ │ └── moe/
│ │ ├── test_moe_runners_1gpu.py
│ │ └── test_moe_runners_4gpu.py
│ ├── lora/
│ │ ├── test_lora_cuda_graph.py
│ │ ├── test_lora_llama4.py
│ │ ├── test_lora_ops.py
│ │ ├── test_lora_qwen3_vl.py
│ │ ├── test_lora_spec_decoding.py
│ │ └── test_torch_backend.py
│ ├── models/
│ │ ├── test_clip_models.py
│ │ ├── test_falcon_h1_models.py
│ │ ├── test_gme_qwen_models.py
│ │ ├── test_grok_models.py
│ │ ├── test_kimi_k2_models.py
│ │ ├── test_llama4_models.py
│ │ ├── test_mistral_large3_basic.py
│ │ ├── test_mtp_models.py
│ │ └── test_unsloth_models.py
│ ├── nightly/
│ │ ├── test_deepseek_v31_perf.py
│ │ ├── test_deepseek_v32_perf.py
│ │ ├── test_text_models_gsm8k_eval.py
│ │ ├── test_text_models_perf.py
│ │ ├── test_vlms_mmmu_eval.py
│ │ ├── test_vlms_perf.py
│ │ ├── test_vlms_piecewise_cuda_graph.py
│ │ ├── test_vlms_vit_cuda_graph.py
│ │ └── test_vlms_vit_flashinfer_cudnn.py
│ ├── openai_server/
│ │ └── features/
│ │ ├── test_cache_report.py
│ │ ├── test_continuous_usage_stats.py
│ │ └── test_structural_tag.py
│ ├── piecewise_cudagraph/
│ │ └── test_disaggregation_piecewise_cuda_graph.py
│ ├── quant/
│ │ └── test_fp8_kvcache.py
│ ├── test_async_dynamic_batch_tokenizer.py
│ ├── test_async_mm_data_processor.py
│ ├── test_config_integration.py
│ ├── test_custom_allreduce.py
│ ├── test_deepseek_chat_templates.py
│ ├── test_double_sparsity.py
│ ├── test_expert_distribution.py
│ ├── test_expert_location_updater.py
│ ├── test_fim_completion.py
│ ├── test_forward_split_prefill.py
│ ├── test_get_weights_by_name.py
│ ├── test_health_check.py
│ ├── test_kv_events.py
│ ├── test_logprobs.py
│ ├── test_mla_tp.py
│ ├── test_modelopt.py
│ ├── test_modelopt_fp8kvcache.py
│ ├── test_models_from_modelscope.py
│ ├── test_mori_transfer_engine_e2e.py
│ ├── test_mscclpp.py
│ ├── test_quick_allreduce.py
│ ├── test_ray_engine.py
│ ├── test_sagemaker_server.py
│ ├── test_schedule_policy.py
│ ├── test_srt_engine_with_quant_args.py
│ ├── test_tokenizer_batch_encode.py
│ ├── test_tokenizer_manager.py
│ ├── test_torch_flex_attention_backend.py
│ ├── test_torch_tp.py
│ ├── test_tracing.py
│ ├── test_triton_attention_rocm_mla.py
│ ├── test_triton_moe_wna16.py
│ ├── test_trtllm_fp8_kv_kernel.py
│ ├── test_two_batch_overlap.py
│ ├── test_vertex_endpoint.py
│ ├── test_vlm_accuracy.py
│ ├── test_wave_attention_backend.py
│ ├── test_weight_validation.py
│ ├── test_weight_version.py
│ └── vlm/
│ └── test_anthropic_vision.py
├── pytest.ini
├── registered/
│ ├── 4-gpu-models/
│ │ ├── test_deepseek_v3_cutedsl_4gpu.py
│ │ ├── test_gpt_oss_4gpu.py
│ │ ├── test_nvidia_nemotron_3_super_nvfp4.py
│ │ ├── test_qwen35_models.py
│ │ ├── test_qwen3_next_models.py
│ │ └── test_qwen3_next_models_mtp.py
│ ├── 8-gpu-models/
│ │ ├── test_deepseek_v31.py
│ │ ├── test_deepseek_v32.py
│ │ ├── test_deepseek_v32_basic.py
│ │ ├── test_deepseek_v32_cp_single_node.py
│ │ ├── test_deepseek_v32_mtp.py
│ │ ├── test_deepseek_v3_basic.py
│ │ ├── test_deepseek_v3_mtp.py
│ │ ├── test_glm_46.py
│ │ ├── test_glm_46_fp8.py
│ │ ├── test_gpt_oss_120b.py
│ │ ├── test_kimi_k25.py
│ │ ├── test_llama4.py
│ │ ├── test_mimo_models.py
│ │ ├── test_minimax_m25.py
│ │ ├── test_mistral_large3.py
│ │ ├── test_nvidia_nemotron_3_super_bf16.py
│ │ ├── test_nvidia_nemotron_3_super_nightly.py
│ │ ├── test_qwen35.py
│ │ ├── test_qwen3_235b.py
│ │ └── test_ring_2_5_1t.py
│ ├── README.md
│ ├── amd/
│ │ ├── accuracy/
│ │ │ ├── mi30x/
│ │ │ │ ├── test_deepseek_r1_eval_amd.py
│ │ │ │ ├── test_deepseek_v31_eval_amd.py
│ │ │ │ ├── test_deepseek_v32_dp_eval_amd.py
│ │ │ │ ├── test_deepseek_v32_eval_amd.py
│ │ │ │ ├── test_deepseek_v32_mtp_eval_amd.py
│ │ │ │ ├── test_deepseek_v32_tc_eval_amd.py
│ │ │ │ ├── test_glm5_eval_amd.py
│ │ │ │ ├── test_gpt_oss_eval_amd.py
│ │ │ │ ├── test_grok1_fp8_eval_amd.py
│ │ │ │ ├── test_grok1_int4_eval_amd.py
│ │ │ │ ├── test_grok2_eval_amd.py
│ │ │ │ ├── test_grok_eval_amd.py
│ │ │ │ ├── test_gsm8k_eval_amd.py
│ │ │ │ ├── test_kimi_k25_eval_amd.py
│ │ │ │ ├── test_kimi_k2_eval_amd.py
│ │ │ │ ├── test_minimax_m25_eval_amd.py
│ │ │ │ ├── test_qwen35_eval_amd.py
│ │ │ │ └── test_vlms_mmmu_eval_amd.py
│ │ │ └── mi35x/
│ │ │ ├── test_deepseek_r1_eval_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_eval_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py
│ │ │ ├── test_deepseek_v32_dp_eval_mi35x.py
│ │ │ ├── test_deepseek_v32_eval_mi35x.py
│ │ │ ├── test_deepseek_v32_mtp_eval_mi35x.py
│ │ │ ├── test_glm5_eval_mi35x.py
│ │ │ ├── test_gpt_oss_eval_mi35x.py
│ │ │ ├── test_grok1_int4_eval_mi35x.py
│ │ │ ├── test_grok2_eval_mi35x.py
│ │ │ ├── test_kimi_k25_aiter_mla_eval_mi35x.py
│ │ │ ├── test_kimi_k25_eval_mi35x.py
│ │ │ ├── test_kimi_k25_mxfp4_eval_mi35x.py
│ │ │ ├── test_kimi_k2_eval_mi35x.py
│ │ │ ├── test_minimax_m25_eval_mi35x.py
│ │ │ ├── test_qwen35_eval_mi35x.py
│ │ │ └── test_qwen3_coder_next_eval_mi35x.py
│ │ ├── disaggregation/
│ │ │ ├── test_disaggregation_basic.py
│ │ │ └── test_disaggregation_pp.py
│ │ ├── perf/
│ │ │ ├── mi30x/
│ │ │ │ ├── test_deepseek_v31_perf.py
│ │ │ │ ├── test_deepseek_v32_basic_perf_amd.py
│ │ │ │ ├── test_deepseek_v32_mtp_perf_amd.py
│ │ │ │ ├── test_deepseek_v3_perf.py
│ │ │ │ ├── test_grok1_fp8_perf.py
│ │ │ │ ├── test_grok1_int4_perf.py
│ │ │ │ ├── test_grok2_perf.py
│ │ │ │ ├── test_text_models_perf_amd.py
│ │ │ │ └── test_vlms_perf_amd.py
│ │ │ └── mi35x/
│ │ │ ├── test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py
│ │ │ ├── test_deepseek_r1_mxfp4_perf_mi35x.py
│ │ │ ├── test_deepseek_v32_basic_perf_mi35x.py
│ │ │ ├── test_deepseek_v32_mtp_perf_mi35x.py
│ │ │ ├── test_grok1_int4_perf_mi35x.py
│ │ │ └── test_grok2_perf_mi35x.py
│ │ ├── test_deepseek_r1_mxfp4_8gpu.py
│ │ ├── test_deepseek_v32_basic.py
│ │ ├── test_deepseek_v32_mtp.py
│ │ ├── test_deepseek_v3_basic.py
│ │ ├── test_deepseek_v3_basic_kv_fp8.py
│ │ ├── test_deepseek_v3_mtp.py
│ │ ├── test_deepseek_v3_mtp_kv_fp8.py
│ │ ├── test_kimi_k25_mxfp4.py
│ │ ├── test_kimi_k2_instruct.py
│ │ ├── test_moriep_small.py
│ │ ├── test_qwen3_coder_next_8gpu.py
│ │ ├── test_qwen3_instruct.py
│ │ ├── test_qwen3_instruct_fp8.py
│ │ ├── test_qwen3_instruct_mxfp4.py
│ │ └── test_zimage_turbo.py
│ ├── ascend/
│ │ ├── basic_function/
│ │ │ ├── HiCache/
│ │ │ │ ├── test_npu_hierarchical_cache.py
│ │ │ │ ├── test_npu_hierarchical_cache_mla.py
│ │ │ │ ├── test_npu_hierarchical_cache_mutually_exclusive.py
│ │ │ │ ├── test_npu_hierarchical_cache_ttft_mha.py
│ │ │ │ └── test_npu_radix_cache.py
│ │ │ ├── parallel_strategy/
│ │ │ │ └── expert_parallelism/
│ │ │ │ ├── test_npu_deepep_auto_deepseek_v3_2_w8a8.py
│ │ │ │ ├── test_npu_deepep_auto_qwen3_480b.py
│ │ │ │ ├── test_npu_deepep_auto_qwen3_next.py
│ │ │ │ ├── test_npu_deepep_low_latency_deepseek_v3_2_w8a8.py
│ │ │ │ ├── test_npu_deepep_low_latency_qwen3_480b.py
│ │ │ │ └── test_npu_deepep_low_latency_qwen3_next.py
│ │ │ ├── parameter/
│ │ │ │ ├── deepseek_coder.json
│ │ │ │ ├── test_npu_fim_completion.py
│ │ │ │ ├── test_npu_log_level.py
│ │ │ │ ├── test_npu_no_chunked_prefill.py
│ │ │ │ ├── test_npu_no_overlap_scheduler.py
│ │ │ │ ├── test_npu_original_logprobs.py
│ │ │ │ └── test_npu_warmups.py
│ │ │ └── speculative_inference/
│ │ │ └── test_npu_eagle3.py
│ │ ├── embedding_models/
│ │ │ └── test_npu_bge_large_en_v1_5.py
│ │ ├── interface/
│ │ │ ├── test_npu_api.py
│ │ │ ├── test_npu_api_abort_request.py
│ │ │ ├── test_npu_api_encode.py
│ │ │ ├── test_npu_enable_thinking.py
│ │ │ ├── test_npu_matched_stop.py
│ │ │ ├── test_npu_openai_function_calling.py
│ │ │ ├── test_npu_openai_server_ignore_eos.py
│ │ │ └── test_npu_penalty.py
│ │ ├── llm_models/
│ │ │ ├── test_npu_afm_4_5b.py
│ │ │ ├── test_npu_baichuan2_13b_chat.py
│ │ │ ├── test_npu_c4ai_command_r_v01.py
│ │ │ ├── test_npu_chatglm2_6b.py
│ │ │ ├── test_npu_deepseek_v3_2_exp_w8a8.py
│ │ │ ├── test_npu_exaone_3.py
│ │ │ ├── test_npu_gemma_3_4b_it_llm.py
│ │ │ ├── test_npu_glm4_9b_chat.py
│ │ │ ├── test_npu_granite_3_0_3b_a800m.py
│ │ │ ├── test_npu_granite_3_1_8b.py
│ │ │ ├── test_npu_grok_2.py
│ │ │ ├── test_npu_internlm2_7b.py
│ │ │ ├── test_npu_ling_lite.py
│ │ │ ├── test_npu_llama4_scount_17b_16e.py
│ │ │ ├── test_npu_llama_2_7b.py
│ │ │ ├── test_npu_mimo_7b_rl.py
│ │ │ ├── test_npu_minicpm3_4b.py
│ │ │ ├── test_npu_mistral_7b.py
│ │ │ ├── test_npu_persimmon_8b_chat.py
│ │ │ ├── test_npu_phi_4_multimodal_llm.py
│ │ │ ├── test_npu_qwen3_0_6b.py
│ │ │ ├── test_npu_qwen3_1_7b_gptq_int8.py
│ │ │ ├── test_npu_qwen3_235b_a22b_w8a8.py
│ │ │ ├── test_npu_qwen3_30b.py
│ │ │ ├── test_npu_qwen3_30b_w4a4.py
│ │ │ ├── test_npu_qwen3_32b.py
│ │ │ ├── test_npu_qwen3_coder_480b_a35b.py
│ │ │ ├── test_npu_qwq_32b_w8a8.py
│ │ │ ├── test_npu_smollm_1_7b.py
│ │ │ ├── test_npu_stablelm_2_1_6b.py
│ │ │ └── tool_chat_template_c4ai_command_r_v01.jinja
│ │ ├── rerank_models/
│ │ │ └── test_npu_bge_reranker_v2_m3.py
│ │ ├── reward_models/
│ │ │ ├── test_npu_gemma_2_27b_v0_2.py
│ │ │ ├── test_npu_internlm2_7b_reward.py
│ │ │ └── test_npu_llama_3_1_8b_v0_2.py
│ │ ├── test_npu_memory_consumption.py
│ │ └── vlm_models/
│ │ ├── mmmu-val.yaml
│ │ ├── test_npu_deepseek_vl2.py
│ │ ├── test_npu_gemma_3_4b_it.py
│ │ ├── test_npu_janus_pro_1b.py
│ │ ├── test_npu_janus_pro_7b.py
│ │ ├── test_npu_kimi_vl_a3b_instruct.py
│ │ ├── test_npu_llama_3_2_11b_vision_instruct.py
│ │ ├── test_npu_mimo_vl_7b_rl.py
│ │ ├── test_npu_minicpm_o_2_6.py
│ │ ├── test_npu_minicpm_v_2_6.py
│ │ ├── test_npu_mistral_small_3_1_24b_instruct_2503.py
│ │ ├── test_npu_phi4_multimodal_instruct.py
│ │ ├── test_npu_qwen2_5_vl_3b_instruct.py
│ │ ├── test_npu_qwen2_5_vl_72b_instruct.py
│ │ ├── test_npu_qwen3_vl_235b_a22b_instruct.py
│ │ ├── test_npu_qwen3_vl_30b_a3b_instruct.py
│ │ ├── test_npu_qwen3_vl_4b_instruct.py
│ │ └── test_npu_qwen3_vl_8b_instruct.py
│ ├── attention/
│ │ ├── test_chunk_gated_delta_rule.py
│ │ ├── test_create_kvindices.py
│ │ ├── test_fa3.py
│ │ ├── test_flash_attention_4.py
│ │ ├── test_hybrid_attn_backend.py
│ │ ├── test_kda_kernels.py
│ │ ├── test_local_attn.py
│ │ ├── test_torch_native_attention_backend.py
│ │ ├── test_triton_attention_backend.py
│ │ ├── test_triton_attention_kernels.py
│ │ ├── test_triton_sliding_window.py
│ │ └── test_wave_attention_kernels.py
│ ├── backends/
│ │ ├── test_deepseek_r1_fp8_trtllm_backend.py
│ │ ├── test_deepseek_v3_fp4_cutlass_moe.py
│ │ ├── test_flashinfer_trtllm_gen_attn_backend.py
│ │ ├── test_flashinfer_trtllm_gen_moe_backend.py
│ │ ├── test_qwen3_fp4_trtllm_gen_moe.py
│ │ └── test_torch_compile.py
│ ├── bench_fn/
│ │ ├── test_bench_serving_functionality.py
│ │ └── test_benchmark_datasets_api.py
│ ├── constrained_decoding/
│ │ └── test_constrained_decoding.py
│ ├── core/
│ │ ├── test_cpp_radix_cache.py
│ │ ├── test_deepseek_v3_deterministic.py
│ │ ├── test_deterministic.py
│ │ ├── test_gpt_oss_1gpu.py
│ │ ├── test_gpt_oss_sm120.py
│ │ ├── test_hidden_states.py
│ │ ├── test_page_size.py
│ │ ├── test_qwen3_next_deterministic.py
│ │ ├── test_request_queue_validation.py
│ │ ├── test_score_api.py
│ │ ├── test_srt_endpoint.py
│ │ └── test_srt_engine.py
│ ├── debug_utils/
│ │ ├── comparator/
│ │ │ ├── __init__.py
│ │ │ ├── aligner/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── entrypoint/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── conftest.py
│ │ │ │ │ ├── test_executor.py
│ │ │ │ │ └── test_planner.py
│ │ │ │ ├── reorderer/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── conftest.py
│ │ │ │ │ ├── test_executor.py
│ │ │ │ │ └── test_planner.py
│ │ │ │ ├── test_axis_aligner.py
│ │ │ │ ├── token_aligner/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── conftest.py
│ │ │ │ │ ├── test_aux_loader.py
│ │ │ │ │ ├── test_aux_plugins.py
│ │ │ │ │ ├── test_concat_steps.py
│ │ │ │ │ ├── test_executor.py
│ │ │ │ │ ├── test_planner.py
│ │ │ │ │ └── test_thd_seq_lens_loader.py
│ │ │ │ └── unsharder/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── test_executor.py
│ │ │ │ ├── test_parallel_info.py
│ │ │ │ └── test_planner.py
│ │ │ ├── conftest.py
│ │ │ ├── dims_spec/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_dim_parser.py
│ │ │ │ ├── test_dims_parser.py
│ │ │ │ ├── test_tensor_naming.py
│ │ │ │ └── test_types.py
│ │ │ ├── tensor_comparator/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── conftest.py
│ │ │ │ ├── test_comparator.py
│ │ │ │ ├── test_formatter.py
│ │ │ │ └── test_types.py
│ │ │ ├── test_bundle_comparator.py
│ │ │ ├── test_bundle_matcher.py
│ │ │ ├── test_display.py
│ │ │ ├── test_dp_utils.py
│ │ │ ├── test_dump_loader.py
│ │ │ ├── test_entrypoint.py
│ │ │ ├── test_log_sink.py
│ │ │ ├── test_manually_verify.py
│ │ │ ├── test_meta_overrider.py
│ │ │ ├── test_model_validation.py
│ │ │ ├── test_output_types.py
│ │ │ ├── test_per_token_visualizer.py
│ │ │ ├── test_preset.py
│ │ │ ├── test_utils.py
│ │ │ ├── test_visualizer.py
│ │ │ └── testing_helpers.py
│ │ ├── source_patcher/
│ │ │ ├── conftest.py
│ │ │ ├── test_code_patcher.py
│ │ │ ├── test_dumper_integration.py
│ │ │ └── test_source_editor.py
│ │ ├── test_crash_dump.py
│ │ ├── test_cuda_coredump_smoke.py
│ │ ├── test_dump_comparator.py
│ │ ├── test_dump_loader.py
│ │ ├── test_dumper.py
│ │ ├── test_engine_dumper_comparator_e2e.py
│ │ ├── test_schedule_simulator.py
│ │ ├── test_soft_watchdog.py
│ │ └── test_tensor_dump_forward_hook.py
│ ├── disaggregation/
│ │ ├── test_disaggregation_basic.py
│ │ ├── test_disaggregation_decode_offload.py
│ │ └── test_specv2_kvcache_offloading.py
│ ├── distributed/
│ │ ├── test_data_parallelism.py
│ │ ├── test_disaggregation_aarch64.py
│ │ ├── test_disaggregation_different_tp.py
│ │ ├── test_disaggregation_dp_attention.py
│ │ ├── test_disaggregation_hybrid_attention.py
│ │ ├── test_disaggregation_pp.py
│ │ ├── test_dp_attention.py
│ │ ├── test_dp_attention_large.py
│ │ ├── test_epd_disaggregation.py
│ │ ├── test_load_weights_from_remote_instance.py
│ │ ├── test_load_weights_from_remote_instance_npu.py
│ │ ├── test_parallel_state.py
│ │ └── test_pp_single_node.py
│ ├── dllm/
│ │ ├── test_llada2_mini.py
│ │ └── test_llada2_mini_amd.py
│ ├── embedding/
│ │ ├── test_embedding_models.py
│ │ ├── test_encoder_embedding_models.py
│ │ ├── test_input_embeddings.py
│ │ ├── test_input_embeds_chunked.py
│ │ └── test_openai_embedding.py
│ ├── ep/
│ │ ├── test_deepep_large.py
│ │ ├── test_deepep_small.py
│ │ └── test_mooncake_ep_small.py
│ ├── eval/
│ │ ├── test_eval_accuracy_large.py
│ │ ├── test_moe_eval_accuracy_large.py
│ │ ├── test_text_models_gsm8k_eval.py
│ │ └── test_vlms_mmmu_eval.py
│ ├── function_call/
│ │ └── test_kimik2_detector.py
│ ├── hicache/
│ │ ├── test_hicache_storage.py
│ │ ├── test_hicache_storage_3fs_backend.py
│ │ ├── test_hicache_storage_file_backend.py
│ │ ├── test_hicache_storage_mooncake_backend.py
│ │ ├── test_hicache_storage_runtime_attach_detach.py
│ │ └── test_hicache_variants.py
│ ├── kernels/
│ │ ├── test_fp4_moe.py
│ │ ├── test_fused_topk_deepseek.py
│ │ └── test_nsa_indexer.py
│ ├── layers/
│ │ ├── mamba/
│ │ │ ├── conftest.py
│ │ │ ├── test_causal_conv1d.py
│ │ │ ├── test_mamba2_mixer.py
│ │ │ ├── test_mamba_ssm.py
│ │ │ └── test_mamba_ssm_ssd.py
│ │ └── test_fla_layernorm_guard.py
│ ├── lora/
│ │ ├── test_chunked_sgmv_backend.py
│ │ ├── test_embedding_lora_support.py
│ │ ├── test_fused_moe_lora_kernel.py
│ │ ├── test_lora_backend.py
│ │ ├── test_lora_eviction.py
│ │ ├── test_lora_eviction_policy.py
│ │ ├── test_lora_hf_sgl_logprob_diff.py
│ │ ├── test_lora_openai_api.py
│ │ ├── test_lora_openai_compatible.py
│ │ ├── test_lora_overlap_loading.py
│ │ ├── test_lora_qwen3.py
│ │ ├── test_lora_radix_cache.py
│ │ ├── test_lora_tied_lm_head.py
│ │ ├── test_lora_tp.py
│ │ ├── test_lora_update.py
│ │ └── test_multi_lora_backend.py
│ ├── metrics/
│ │ ├── test_metrics.py
│ │ └── test_priority_metrics.py
│ ├── mla/
│ │ ├── test_flashmla.py
│ │ ├── test_mla.py
│ │ ├── test_mla_deepseek_v3.py
│ │ ├── test_mla_flashinfer.py
│ │ ├── test_mla_fp8.py
│ │ └── test_mla_int8_deepseek_v3.py
│ ├── model_loading/
│ │ ├── test_external_models.py
│ │ └── test_utils_update_weights.py
│ ├── models/
│ │ ├── test_compressed_tensors_models.py
│ │ ├── test_cross_encoder_models.py
│ │ ├── test_dummy_grok_models.py
│ │ ├── test_generation_models.py
│ │ ├── test_gpt_oss_models_pcg.py
│ │ ├── test_kimi_linear_models.py
│ │ ├── test_kimi_linear_models_pcg.py
│ │ ├── test_ministral3_models.py
│ │ ├── test_nvidia_nemotron_3_nano.py
│ │ ├── test_nvidia_nemotron_nano_v2.py
│ │ ├── test_nvidia_nemotron_nano_v2_vl.py
│ │ ├── test_qwen3_next_models_fp4.py
│ │ ├── test_qwen3_next_models_pcg.py
│ │ ├── test_qwen_models.py
│ │ ├── test_reward_models.py
│ │ ├── test_transformers_models.py
│ │ └── test_vlm_models.py
│ ├── moe/
│ │ ├── test_cutedsl_moe.py
│ │ ├── test_fused_moe.py
│ │ ├── test_glm4_moe_models.py
│ │ ├── test_moe_ep.py
│ │ ├── test_torch_compile_moe.py
│ │ ├── test_triton_fused_moe.py
│ │ └── test_triton_moe_channel_fp8_kernel.py
│ ├── openai_server/
│ │ ├── basic/
│ │ │ ├── test_anthropic_server.py
│ │ │ ├── test_openai_server.py
│ │ │ ├── test_protocol.py
│ │ │ ├── test_serving_chat.py
│ │ │ ├── test_serving_completions.py
│ │ │ └── test_serving_rerank.py
│ │ ├── features/
│ │ │ ├── test_enable_thinking.py
│ │ │ ├── test_json_mode.py
│ │ │ ├── test_openai_server_ebnf.py
│ │ │ ├── test_openai_server_hidden_states.py
│ │ │ └── test_reasoning_content.py
│ │ ├── function_call/
│ │ │ ├── test_anthropic_tool_use.py
│ │ │ ├── test_openai_function_calling.py
│ │ │ └── test_tool_choice.py
│ │ └── validation/
│ │ ├── test_large_max_new_tokens.py
│ │ ├── test_matched_stop.py
│ │ ├── test_openai_server_ignore_eos.py
│ │ └── test_request_length_validation.py
│ ├── ops/
│ │ ├── test_aiter_allreduce_fusion_amd.py
│ │ └── test_repeat_interleave.py
│ ├── perf/
│ │ ├── test_bench_one_batch_1gpu.py
│ │ ├── test_bench_one_batch_2gpu.py
│ │ ├── test_bench_serving_1gpu_large.py
│ │ ├── test_bench_serving_1gpu_part1.py
│ │ ├── test_bench_serving_1gpu_part2.py
│ │ ├── test_bench_serving_2gpu.py
│ │ ├── test_dpsk_r1_fp4_4gpu_perf.py
│ │ ├── test_gpt_oss_4gpu_perf.py
│ │ ├── test_text_models_perf.py
│ │ ├── test_vlm_perf_5090.py
│ │ └── test_vlms_perf.py
│ ├── piecewise_cuda_graph/
│ │ └── test_piecewise_cuda_graph_support_1_gpu.py
│ ├── profiling/
│ │ ├── test_profile_v2.py
│ │ └── test_start_profile.py
│ ├── quant/
│ │ ├── test_autoround.py
│ │ ├── test_awq.py
│ │ ├── test_awq_dequant.py
│ │ ├── test_block_int8.py
│ │ ├── test_bnb.py
│ │ ├── test_deepseek_v32_fp4_4gpu.py
│ │ ├── test_deepseek_v32_fp4_mtp_4gpu.py
│ │ ├── test_deepseek_v3_fp4_4gpu.py
│ │ ├── test_eval_fp8_accuracy.py
│ │ ├── test_fp8_blockwise_gemm.py
│ │ ├── test_fp8_kernel.py
│ │ ├── test_fp8_utils.py
│ │ ├── test_fp8kv_triton.py
│ │ ├── test_fused_rms_fp8_group_quant.py
│ │ ├── test_gguf.py
│ │ ├── test_gptqmodel_dynamic.py
│ │ ├── test_int4fp8_moe.py
│ │ ├── test_int8_kernel.py
│ │ ├── test_marlin_moe.py
│ │ ├── test_modelopt_fp8.py
│ │ ├── test_nvfp4_gemm.py
│ │ ├── test_quant_config_parsing.py
│ │ ├── test_quantization.py
│ │ ├── test_torchao.py
│ │ ├── test_triton_scaled_mm.py
│ │ ├── test_w4a8_deepseek_v3.py
│ │ └── test_w8a8_quantization.py
│ ├── radix_cache/
│ │ ├── test_radix_attention.py
│ │ ├── test_radix_cache_hit.py
│ │ └── test_swa_radix_cache_kl.py
│ ├── rl/
│ │ ├── test_fp32_lm_head.py
│ │ ├── test_lora_load_from_tensor.py
│ │ ├── test_multi_instance_release_memory_occupation.py
│ │ ├── test_patch_torch.py
│ │ ├── test_release_memory_occupation.py
│ │ ├── test_return_routed_experts.py
│ │ ├── test_update_weights_from_disk.py
│ │ ├── test_update_weights_from_distributed.py
│ │ └── test_update_weights_from_tensor.py
│ ├── rotary/
│ │ ├── test_mrope.py
│ │ └── test_rope_rocm.py
│ ├── sampling/
│ │ ├── test_original_logprobs.py
│ │ ├── test_penalty.py
│ │ └── test_pytorch_sampling_backend.py
│ ├── scheduler/
│ │ ├── test_abort.py
│ │ ├── test_chunked_prefill.py
│ │ ├── test_no_chunked_prefill.py
│ │ ├── test_no_overlap_scheduler.py
│ │ ├── test_prefill_delayer.py
│ │ ├── test_priority_scheduling.py
│ │ ├── test_retract_decode.py
│ │ └── test_routing_key_scheduling.py
│ ├── sessions/
│ │ ├── test_session_control.py
│ │ ├── test_session_latency.py
│ │ └── test_streaming_session.py
│ ├── spec/
│ │ ├── eagle/
│ │ │ ├── test_deepseek_v3_fp4_mtp_small.py
│ │ │ ├── test_eagle3_basic.py
│ │ │ ├── test_eagle_constrained_decoding.py
│ │ │ ├── test_eagle_dp_attention.py
│ │ │ ├── test_eagle_infer_a.py
│ │ │ ├── test_eagle_infer_b.py
│ │ │ ├── test_eagle_infer_beta.py
│ │ │ ├── test_eagle_infer_beta_dp_attention.py
│ │ │ └── test_eagle_infer_beta_dp_attention_large.py
│ │ ├── test_constrained_decoding_spec_reasoning.py
│ │ ├── test_ngram_speculative_decoding.py
│ │ ├── test_standalone_speculative_decoding.py
│ │ └── utils/
│ │ └── test_build_eagle_tree.py
│ ├── stress/
│ │ ├── test_stress_deepseek_v3.py
│ │ ├── test_stress_glm_4_6.py
│ │ ├── test_stress_kimi_k2.py
│ │ └── test_stress_qwen3_235b.py
│ ├── test_hybrid_dp_ep_tp_mtp.py
│ ├── test_srt_backend.py
│ ├── tokenizer/
│ │ ├── test_multi_tokenizer.py
│ │ └── test_skip_tokenizer_init.py
│ ├── unit/
│ │ ├── README.md
│ │ ├── batch_invariant_ops/
│ │ │ └── test_batch_invariant_ops.py
│ │ ├── entrypoints/
│ │ │ ├── openai/
│ │ │ │ └── test_serving_embedding.py
│ │ │ └── test_ssl_cert_refresher.py
│ │ ├── function_call/
│ │ │ ├── test_function_call_parser.py
│ │ │ ├── test_glm47_moe_detector.py
│ │ │ ├── test_json_schema_constraint.py
│ │ │ ├── test_parallel_tool_calls.py
│ │ │ └── test_unknown_tool_name.py
│ │ ├── layers/
│ │ │ ├── test_conv_layer.py
│ │ │ └── test_mamba_state_scatter_triton.py
│ │ ├── managers/
│ │ │ ├── test_io_struct.py
│ │ │ ├── test_prefill_adder.py
│ │ │ └── test_profile_merger_http_api.py
│ │ ├── mem_cache/
│ │ │ ├── test_evict_policy.py
│ │ │ ├── test_mamba_unittest.py
│ │ │ ├── test_nsa_pool_host_unit.py
│ │ │ ├── test_radix_cache_slru_accuracy.py
│ │ │ ├── test_radix_cache_unit.py
│ │ │ └── test_swa_unittest.py
│ │ ├── model_executor/
│ │ │ └── test_model_hoo
Showing preview only (1,031K chars total). Download the full file or copy to clipboard to get everything.
SYMBOL INDEX (12024 symbols across 1049 files)
FILE: .github/update_ci_permission.py
function github_api_get (line 64) | def github_api_get(endpoint, params=None):
function get_write_access_users (line 103) | def get_write_access_users():
function get_top_contributors (line 120) | def get_top_contributors(days=90, limit=50):
function load_existing_permissions (line 139) | def load_existing_permissions():
function sort_permissions_file (line 149) | def sort_permissions_file():
function main (line 165) | def main():
FILE: 3rdparty/amd/tuning/benchmark_moe_rocm.py
function main (line 21) | def main(model, tp_size, dtype: str, batches):
function prune_configs (line 28) | def prune_configs(M, N, K, configs):
function union_of_list_of_dicts (line 103) | def union_of_list_of_dicts(l1, l2):
function run_grid (line 114) | def run_grid(bs, model, method, tp_size, dtype: str):
function run_timing (line 268) | def run_timing(
FILE: benchmark/asr/bench_sglang.py
function to_bytes (line 19) | def to_bytes(y, sr):
function run_asr_chat (line 26) | async def run_asr_chat(client, model_name, y, sr):
function run_asr_transcription_sync (line 55) | def run_asr_transcription_sync(client, model_name, y, sr, language=None):
function run_asr_transcription_stream_sync (line 75) | def run_asr_transcription_stream_sync(
function run_asr_transcription (line 125) | async def run_asr_transcription(
function bound_asr (line 153) | async def bound_asr(
function process_dataset (line 199) | async def process_dataset(
function run_evaluation (line 259) | def run_evaluation(args):
FILE: benchmark/bench_attention_sink/bench_attention_sink_triton.py
function benchmark_decode (line 36) | def benchmark_decode(B, S, H_Q, H_KV, D):
function benchmark_extend (line 139) | def benchmark_extend(B, S, H_Q, H_KV, D):
FILE: benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py
function generate_random_string (line 18) | def generate_random_string(token_length: int) -> str:
function generate_unique_prefix (line 35) | def generate_unique_prefix(base_text, index):
function text_qa (line 40) | def text_qa(s, question, gen_len):
function prepare_prompts (line 45) | def prepare_prompts(num_prefix, num_samples_per_prefix, prefix_length, s...
function test_batch_by_batch (line 62) | def test_batch_by_batch(all_prompts, gen_len):
function test_batch_by_batch_with_hint (line 76) | def test_batch_by_batch_with_hint(all_prompts, gen_len):
function test_send_all (line 92) | def test_send_all(all_prompts, gen_len):
FILE: benchmark/bench_linear_attention/bench_gdn_decode.py
function make_inputs (line 44) | def make_inputs(
function run_baseline (line 94) | def run_baseline(inp):
function run_packed (line 132) | def run_packed(inp):
function check_correctness (line 160) | def check_correctness(B, H, HV, K, V, pool_size, device, dtype, seed=42):
function bench_shape (line 211) | def bench_shape(B, H, HV, K, V, pool_size, device, dtype):
function run_correctness (line 304) | def run_correctness(device, dtype):
function run_benchmark (line 359) | def run_benchmark(device, dtype, args):
function main (line 424) | def main():
FILE: benchmark/bench_linear_attention/bench_gdn_prefill.py
function make_k_contiguous (line 42) | def make_k_contiguous(t: torch.Tensor) -> torch.Tensor:
function gdn_flops (line 50) | def gdn_flops(
function gdn_bytes (line 68) | def gdn_bytes(
function make_inputs (line 101) | def make_inputs(
function run_triton (line 166) | def run_triton(inp):
function run_flashinfer (line 185) | def run_flashinfer(inp):
function check_shape (line 252) | def check_shape(
function bench_shape (line 355) | def bench_shape(B, H, T_per_seq, K, V, pool_size, device, dtype):
function run_correctness (line 453) | def run_correctness(device, dtype):
function run_benchmark (line 520) | def run_benchmark(device, dtype, args):
function main (line 570) | def main():
FILE: benchmark/bench_rope/benchmark_rope_index.py
class DummyVisionConfig (line 25) | class DummyVisionConfig:
class DummyHFConfig (line 30) | class DummyHFConfig:
function calculate_stats (line 42) | def calculate_stats(times: list[float]) -> dict[str, float]:
function _sync (line 54) | def _sync(device: torch.device):
function _approx_hw (line 59) | def _approx_hw(patches: int, merge: int) -> tuple[int, int]:
function generate_test_data (line 66) | def generate_test_data(
function benchmark_rope_index (line 224) | def benchmark_rope_index(
FILE: benchmark/benchmark_batch/benchmark_batch.py
function generate_random_prompt (line 30) | def generate_random_prompt(index, tokenizer_dir, num_tokens):
function prepare_all_prompts (line 43) | def prepare_all_prompts(num_requests, batch_size, num_tokens, tokenizer_...
function send_batch_request (line 75) | def send_batch_request(endpoint, prompts, gen_tokens, request_id):
function run_benchmark (line 101) | def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
function process_results (line 130) | def process_results(results, total_latency, num_requests):
function main (line 167) | def main():
FILE: benchmark/benchmark_batch/benchmark_tokenizer.py
function main (line 11) | def main():
function run_benchmark (line 65) | def run_benchmark(
function benchmark (line 86) | def benchmark(*, data, batch_size, sequential_fn, batch_fn, num_runs, ba...
function print_results (line 119) | def print_results(*, results, func_name, batch_mode):
function print_runs (line 163) | def print_runs(*, label, runs, avg):
function measure_times (line 170) | def measure_times(*, fn, num_runs):
function generate_random_token_ids (line 179) | def generate_random_token_ids(*, num_prompts, num_tokens, tokenizer):
function parse_args (line 188) | def parse_args():
FILE: benchmark/boolq/bench_sglang.py
function get_example (line 15) | def get_example(lines, i, answer):
function few_shot_examples (line 22) | def few_shot_examples(lines, k):
function main (line 29) | def main(args):
FILE: benchmark/boolq/convert_parquet_to_json.py
function convert_parquet_to_json (line 6) | def convert_parquet_to_json(input_file, output_file):
FILE: benchmark/ceval/bench_sglang.py
function get_one_example (line 20) | def get_one_example(line, include_answer):
function get_few_shot_examples (line 32) | def get_few_shot_examples(lines):
function get_answer_value (line 39) | def get_answer_value(response):
function main (line 49) | def main(args):
FILE: benchmark/dspy/bench_dspy_intro.py
class BasicQA (line 12) | class BasicQA(dspy.Signature):
class GenerateAnswer (line 19) | class GenerateAnswer(dspy.Signature):
class RAG (line 27) | class RAG(dspy.Module):
method __init__ (line 28) | def __init__(self, num_passages=3):
method forward (line 34) | def forward(self, question):
function main (line 40) | def main(args):
FILE: benchmark/fla/benchmark_layernorm_gated.py
function benchmark_layer_norm_fwd (line 15) | def benchmark_layer_norm_fwd(
function main (line 264) | def main():
FILE: benchmark/generative_agents/agent_functions.py
function poignancy_event (line 8) | def poignancy_event(s, persona_name, persona_iss, event):
function poignancy_event_prompt (line 18) | def poignancy_event_prompt(persona_name, persona_iss, event):
function generate_event_triple (line 31) | def generate_event_triple(s, persona_name, action):
function generate_event_triple_prompt (line 56) | def generate_event_triple_prompt(persona_name, action):
function generate_pronunciatio (line 83) | def generate_pronunciatio(s, action):
function generate_pronunciatio_prompt (line 89) | def generate_pronunciatio_prompt(action):
function action_location_sector (line 98) | def action_location_sector(
function action_location_sector_prompt (line 158) | def action_location_sector_prompt(
function action_location_object (line 219) | def action_location_object(
function action_location_object_prompt (line 260) | def action_location_object_prompt(
FILE: benchmark/generative_agents/bench_other.py
function main (line 18) | def main(args):
FILE: benchmark/generative_agents/bench_sglang.py
function main (line 21) | def main(args):
FILE: benchmark/gsm8k/bench_other.py
function get_one_example (line 19) | def get_one_example(lines, i, include_answer):
function get_few_shot_examples (line 26) | def get_few_shot_examples(lines, k):
function get_answer_value (line 33) | def get_answer_value(answer_str):
function main (line 44) | def main(args):
FILE: benchmark/gsm8k/bench_sglang.py
function get_one_example (line 22) | def get_one_example(lines, i, include_answer):
function get_few_shot_examples (line 29) | def get_few_shot_examples(lines, k):
function get_answer_value (line 36) | def get_answer_value(answer_str):
function main (line 47) | def main(args):
FILE: benchmark/hellaswag/bench_other.py
function get_one_example (line 14) | def get_one_example(lines, i, include_answer):
function get_few_shot_examples (line 21) | def get_few_shot_examples(lines, k):
function main (line 28) | def main(args):
FILE: benchmark/hellaswag/bench_sglang.py
function get_one_example (line 16) | def get_one_example(lines, i, include_answer):
function get_few_shot_examples (line 23) | def get_few_shot_examples(lines, k):
function main (line 30) | def main(args):
FILE: benchmark/hf3fs/bench_client.py
function print_stats (line 13) | def print_stats(x: List[int]):
function test (line 26) | def test():
function bench (line 66) | def bench():
function main (line 155) | def main():
FILE: benchmark/hf3fs/bench_storage.py
function print_stats (line 17) | def print_stats(x: List[int]):
function test (line 30) | def test():
function bench (line 129) | def bench():
function allclose (line 195) | def allclose():
function main (line 250) | def main():
FILE: benchmark/hicache/bench_long_context.py
class ContextWorkloadGenerator (line 18) | class ContextWorkloadGenerator(WorkloadGenerator):
method __init__ (line 19) | def __init__(self, args):
method response_handler (line 68) | def response_handler(self):
FILE: benchmark/hicache/bench_mix.py
function write_debug_log (line 28) | def write_debug_log(data):
function parse_args (line 42) | def parse_args():
function load_config (line 92) | def load_config():
class UserData (line 125) | class UserData:
function synchronized (line 134) | def synchronized():
class UserGenerator (line 146) | class UserGenerator:
method __init__ (line 147) | def __init__(self, config, model_path, dataset_path):
method gen (line 195) | def gen(self):
method push (line 230) | def push(self, user_data, generated_text, len_itl):
method pop (line 270) | def pop(self):
function gen_payload (line 279) | def gen_payload(prompt, output_len):
function async_request_sglang_generate (line 299) | async def async_request_sglang_generate(
class AtomicCounter (line 371) | class AtomicCounter:
method __init__ (line 372) | def __init__(self, initial_value=0):
method increment (line 377) | def increment(self, amount=1):
method get (line 381) | def get(self):
class WorkloadGenerator (line 385) | class WorkloadGenerator:
method __init__ (line 386) | def __init__(self, args):
method handle_request (line 417) | async def handle_request(self, user_data):
method request_sender (line 427) | def request_sender(self):
method response_handler (line 448) | def response_handler(self):
method run (line 475) | def run(self):
function main (line 544) | def main():
FILE: benchmark/hicache/bench_multiturn.py
function parse_args (line 25) | def parse_args():
function log_to_jsonl_file (line 171) | def log_to_jsonl_file(data, file_path="performance_metrics.jsonl", tag=""):
class ReadyQueue (line 183) | class ReadyQueue:
method __init__ (line 188) | def __init__(self, init_requests=None, policy="random"):
method append (line 193) | def append(self, item):
method pop (line 197) | def pop(self):
class WorkloadGenerator (line 211) | class WorkloadGenerator:
method __init__ (line 212) | def __init__(self, args):
method handle_request (line 388) | async def handle_request(self, item):
method request_sender (line 402) | def request_sender(self):
method response_handler (line 435) | def response_handler(self):
method run (line 544) | def run(self):
FILE: benchmark/hicache/bench_serving.py
class RequestFuncInput (line 43) | class RequestFuncInput:
class RequestFuncOutput (line 56) | class RequestFuncOutput:
function async_request_openai_completions (line 69) | async def async_request_openai_completions(
function async_request_profile (line 202) | async def async_request_profile(api_url: str) -> RequestFuncOutput:
class BenchmarkMetrics (line 228) | class BenchmarkMetrics:
function get_requests (line 261) | async def get_requests(
function calculate_metrics (line 284) | def calculate_metrics(
function benchmark (line 372) | async def benchmark(
function run_benchmark (line 685) | def run_benchmark(args_: argparse.Namespace):
FILE: benchmark/hicache/data_processing.py
function common_filter_chat (line 37) | def common_filter_chat(
function sample_sharegpt_requests (line 94) | def sample_sharegpt_requests(
function sample_ultrachat_requests (line 150) | def sample_ultrachat_requests(
function sample_loogle_requests (line 198) | def sample_loogle_requests(
function sample_nextqa_requests (line 267) | def sample_nextqa_requests(
function sample_random_requests (line 347) | def sample_random_requests(
function sample_generated_shared_prefix_requests (line 434) | def sample_generated_shared_prefix_requests(
function get_dataset (line 520) | def get_dataset(args, tokenizer):
FILE: benchmark/hicache/nextqa.py
function find_video_files (line 9) | def find_video_files(video_dir) -> List[str]:
function video_frames (line 24) | def video_frames(video_path, max_frames) -> int:
class Video (line 30) | class Video:
method __init__ (line 31) | def __init__(self, video_path, num_frames):
method __str__ (line 35) | def __str__(self):
method __iter__ (line 38) | def __iter__(self):
class VideoPrompt (line 42) | class VideoPrompt(Video):
method __init__ (line 43) | def __init__(self, video_path, num_frames, prompt):
method __str__ (line 47) | def __str__(self):
method __iter__ (line 50) | def __iter__(self):
class VideoLoader (line 54) | class VideoLoader:
class VideoFileLoader (line 58) | class VideoFileLoader(VideoLoader):
method __init__ (line 63) | def __init__(self, video_dir, batch_size=1, max_frames=sys.maxsize):
method __iter__ (line 71) | def __iter__(self): # (file, number of frames)
class NExTQALoader (line 85) | class NExTQALoader(VideoLoader):
method __init__ (line 91) | def __init__(
method get_video_prompt (line 114) | def get_video_prompt(self, entry, max_frames) -> VideoPrompt:
method __iter__ (line 126) | def __iter__(self):
FILE: benchmark/hicache/perf.py
function jit_hicache_impl (line 8) | def jit_hicache_impl(
function ref_hicache_impl (line 33) | def ref_hicache_impl(
class HicacheBenchArgs (line 57) | class HicacheBenchArgs(NamedTuple):
function perf (line 63) | def perf(f: Callable[[], Any], loop: int = 100) -> float:
function test_hicache_kernel (line 79) | def test_hicache_kernel(args: HicacheBenchArgs) -> None:
function main (line 210) | def main() -> None:
FILE: benchmark/json_decode_regex/bench_other.py
function json_decode (line 17) | def json_decode(document, generate):
function main (line 38) | def main(args):
FILE: benchmark/json_decode_regex/bench_sglang.py
function json_warm_up (line 17) | def json_warm_up(s):
function json_decode (line 32) | def json_decode(s, document):
function main (line 47) | def main(args):
FILE: benchmark/json_decode_regex/build_dataset.py
function get_content (line 35) | def get_content(city_name):
FILE: benchmark/json_jump_forward/bench_other.py
function character_gen (line 44) | def character_gen(name, generate):
function city_gen (line 51) | def city_gen(document, generate):
function character_maker (line 61) | def character_maker(lm, name):
function call_generate_lmql (line 85) | async def call_generate_lmql(
function city_maker (line 109) | def city_maker(lm, document):
function bench_character (line 132) | def bench_character(args):
function bench_city_doc (line 210) | def bench_city_doc(args):
function main (line 254) | def main(args):
FILE: benchmark/json_jump_forward/bench_sglang.py
function character_gen (line 44) | def character_gen(s, name):
function city_gen (line 51) | def city_gen(s, document):
function bench_city_doc (line 59) | def bench_city_doc(args):
function bench_character (line 82) | def bench_character(args):
function main (line 106) | def main(args):
FILE: benchmark/json_jump_forward/build_dataset.py
function get_content (line 35) | def get_content(city_name):
FILE: benchmark/json_schema/bench_sglang.py
function schema_gen (line 20) | def schema_gen(s, message: Tuple[str, str], json_schema: str):
function contains_formats (line 29) | def contains_formats(schema, formats: List[str]):
function convert_dataset (line 43) | def convert_dataset(path: str):
function bench_schema (line 76) | def bench_schema(args):
function main (line 111) | def main(args):
FILE: benchmark/kernels/all_reduce/benchmark_aiter.py
function parse_args (line 19) | def parse_args():
function get_env_rank_world (line 55) | def get_env_rank_world() -> Tuple[int, int, int]:
function init_dist (line 62) | def init_dist(backend: str):
function get_device (line 73) | def get_device(local_rank: int) -> torch.device:
function human_size (line 78) | def human_size(num_bytes: int) -> str:
function get_message_sizes (line 87) | def get_message_sizes() -> List[int]:
function run_once (line 105) | def run_once(comm, inp: torch.Tensor) -> Optional[torch.Tensor]:
function bench_impl (line 114) | def bench_impl(
function main (line 201) | def main():
FILE: benchmark/kernels/all_reduce/benchmark_all_reduce.py
function parse_args (line 26) | def parse_args():
function get_env_rank_world (line 62) | def get_env_rank_world() -> Tuple[int, int, int]:
function init_dist (line 69) | def init_dist(backend: str):
function get_device (line 92) | def get_device(local_rank: int) -> torch.device:
function human_size (line 97) | def human_size(num_bytes: int) -> str:
function get_message_sizes (line 106) | def get_message_sizes() -> List[int]:
function run_once (line 124) | def run_once(comm, inp: torch.Tensor) -> Optional[torch.Tensor]:
function bench_impl (line 133) | def bench_impl(
function main (line 220) | def main():
FILE: benchmark/kernels/all_reduce/benchmark_fused_ar_rms_amd.py
function parse_shapes (line 48) | def parse_shapes(raw: str) -> List[Shape]:
function dtype_from_name (line 64) | def dtype_from_name(name: str) -> torch.dtype:
function check_close (line 76) | def check_close(
function _measure_us (line 92) | def _measure_us(
function _barrier (line 128) | def _barrier(device: torch.device):
function _mean_across_ranks (line 135) | def _mean_across_ranks(value: float, device: torch.device) -> float:
function _all_true_across_ranks (line 142) | def _all_true_across_ranks(value: bool, device: torch.device) -> bool:
function _make_inputs (line 148) | def _make_inputs(
function _split_reference (line 171) | def _split_reference(
function bench_eager (line 185) | def bench_eager(
function bench_graph (line 234) | def bench_graph(
function _shape_bytes (line 313) | def _shape_bytes(shape: Shape, dtype: torch.dtype) -> int:
function parse_args (line 318) | def parse_args():
function main (line 367) | def main():
FILE: benchmark/kernels/all_reduce/benchmark_mscclpp.py
function torch_allreduce (line 34) | def torch_allreduce(torch_input: torch.Tensor, group: ProcessGroup) -> t...
function msccl_allreduce (line 39) | def msccl_allreduce(
function pynccl_allreduce (line 45) | def pynccl_allreduce(
function _bench_graph_time (line 52) | def _bench_graph_time(func, inp_randn, warmup_loop=2, graph_loop=10, tes...
function _bench_eager_time (line 84) | def _bench_eager_time(func, inp_randn, warmup_loop=2, test_loop=10):
function get_torch_prof_ctx (line 106) | def get_torch_prof_ctx(do_prof: bool):
function human_readable_size (line 122) | def human_readable_size(size, decimal_places=1):
function print_markdown_table (line 137) | def print_markdown_table(data):
FILE: benchmark/kernels/all_reduce/benchmark_torch_symm_mem.py
function torch_allreduce (line 55) | def torch_allreduce(torch_input: torch.Tensor, group: ProcessGroup) -> t...
function torch_symm_mem_allreduce (line 60) | def torch_symm_mem_allreduce(
function pynccl_allreduce (line 66) | def pynccl_allreduce(
function _bench_graph_time (line 73) | def _bench_graph_time(func, inp_randn, warmup_loop=2, graph_loop=10, tes...
function _bench_eager_time (line 105) | def _bench_eager_time(func, inp_randn, warmup_loop=2, test_loop=10):
function get_torch_prof_ctx (line 127) | def get_torch_prof_ctx(do_prof: bool):
function human_readable_size (line 143) | def human_readable_size(size, decimal_places=1):
function print_markdown_table (line 158) | def print_markdown_table(data):
FILE: benchmark/kernels/decoding_attention_triton/triton_flashinfer_cudnn.py
function benchmark_forward (line 13) | def benchmark_forward(
function time_fwd (line 34) | def time_fwd(func, *args, **kwargs):
function decode_attention_sglang (line 39) | def decode_attention_sglang(
function decode_attention_flashinfer (line 98) | def decode_attention_flashinfer(dtype, head_num_q, head_num_kv):
function convert_to_cudnn_type (line 159) | def convert_to_cudnn_type(torch_type):
function decode_attention_cudnn (line 174) | def decode_attention_cudnn(
function calculate_diff (line 287) | def calculate_diff():
FILE: benchmark/kernels/deepep/deepep_utils.py
function init_dist (line 12) | def init_dist(local_rank: int, num_local_ranks: int, args):
function calc_diff (line 36) | def calc_diff(x: torch.Tensor, y: torch.Tensor):
function per_token_cast_to_fp8 (line 43) | def per_token_cast_to_fp8(x: torch.Tensor):
function per_token_cast_back (line 53) | def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
function inplace_unique (line 59) | def inplace_unique(x: torch.Tensor, num_slots: int):
function create_grouped_scores (line 74) | def create_grouped_scores(
function bench (line 84) | def bench(fn, num_warmups: int = 20, num_tests: int = 30, post_fn=None):
class empty_suppress (line 114) | class empty_suppress:
method __enter__ (line 115) | def __enter__(self):
method __exit__ (line 118) | def __exit__(self, *_):
class suppress_stdout_stderr (line 122) | class suppress_stdout_stderr:
method __enter__ (line 123) | def __enter__(self):
method __exit__ (line 143) | def __exit__(self, *_):
function bench_kineto (line 157) | def bench_kineto(
function hash_tensor (line 217) | def hash_tensor(t: torch.Tensor):
FILE: benchmark/kernels/deepep/tuning_deepep.py
function test_main (line 30) | def test_main(
function _write_output (line 421) | def _write_output(args, output_data):
function test_loop (line 429) | def test_loop(local_rank: int, num_local_ranks: int, args):
FILE: benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm.py
function tl_gemm (line 21) | def tl_gemm(
function per_token_cast_to_fp8 (line 98) | def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch....
function per_block_cast_to_fp8 (line 108) | def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch....
function fp8_gemm_deepgemm (line 123) | def fp8_gemm_deepgemm(
function fp8_gemm_sglang (line 140) | def fp8_gemm_sglang(
function fp8_gemm_vllm (line 159) | def fp8_gemm_vllm(
function calculate_diff (line 178) | def calculate_diff(m: int, n: int, k: int):
function get_weight_shapes (line 236) | def get_weight_shapes(tp_size):
function create_benchmark_configs (line 269) | def create_benchmark_configs(tp_size):
function get_benchmark (line 281) | def get_benchmark(tp_size):
FILE: benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm_blackwell.py
function per_block_cast_to_fp8 (line 19) | def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch....
function get_weight_shapes (line 35) | def get_weight_shapes(tp_size):
function create_benchmark_configs (line 68) | def create_benchmark_configs(tp_size):
function fp8_gemm_flashinfer (line 80) | def fp8_gemm_flashinfer(
function fp8_gemm_deepgemm_blackwell (line 98) | def fp8_gemm_deepgemm_blackwell(
function check_accuracy (line 112) | def check_accuracy(a, b, atol, rtol, percent):
function calculate_diff (line 136) | def calculate_diff(m: int, n: int, k: int):
function _benchmark (line 178) | def _benchmark(m, n, k, tp_size, provider):
function get_benchmark_plot_friendly (line 231) | def get_benchmark_plot_friendly(tp_size):
function get_benchmark (line 256) | def get_benchmark(tp_size):
FILE: benchmark/kernels/deepseek/benchmark_deepgemm_fp8_group_gemm.py
function construct_grouped_and_flat_fp8 (line 18) | def construct_grouped_and_flat_fp8(
function fp8_gemm_group_triton_kernel (line 87) | def fp8_gemm_group_triton_kernel(
function fp8_gemm_group_triton (line 183) | def fp8_gemm_group_triton(a_tuple, b_tuple, c, num_groups):
function fp8_gemm_group_deepgemm (line 244) | def fp8_gemm_group_deepgemm(x_fp8_grouped, y_fp8_grouped, out, m_indices):
function calculate_diff (line 254) | def calculate_diff(m: int, n: int, k: int, num_groups: int):
function get_weight_shapes (line 321) | def get_weight_shapes(tp_size):
function create_benchmark_configs (line 354) | def create_benchmark_configs(tp_size):
function get_benchmark (line 367) | def get_benchmark(tp_size):
FILE: benchmark/kernels/elementwise/benchmark_concat_mla.py
function create_data (line 15) | def create_data(num_tokens):
function fn_torch (line 36) | def fn_torch(k, k_nope, k_rope):
function fn_hack_non_strided (line 41) | def fn_hack_non_strided(k, k_nope, k_rope):
function fn_torch_compiled (line 50) | def fn_torch_compiled(k, k_nope, k_rope):
function fn_cuda (line 54) | def fn_cuda(k, k_nope, k_rope):
function fn_triton_kernel (line 59) | def fn_triton_kernel(
function fn_triton (line 109) | def fn_triton(k, k_nope, k_rope):
function execute_and_get_output (line 130) | def execute_and_get_output(f, data):
function benchmark (line 182) | def benchmark(num_tokens, provider):
FILE: benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py
function setup_flashinfer_workspace (line 85) | def setup_flashinfer_workspace(
function cleanup_flashinfer_workspace (line 122) | def cleanup_flashinfer_workspace(ipc_handles):
class FlashInferFusedAllReduceParams (line 134) | class FlashInferFusedAllReduceParams:
method __init__ (line 137) | def __init__(
method get_trtllm_fused_allreduce_kwargs (line 152) | def get_trtllm_fused_allreduce_kwargs(self):
function flashinfer_fused_allreduce_rmsnorm (line 162) | def flashinfer_fused_allreduce_rmsnorm(
function flashinfer_fused_allreduce_rmsnorm_fp8_quant (line 202) | def flashinfer_fused_allreduce_rmsnorm_fp8_quant(
function flashinfer_fused_allreduce_rmsnorm_fp4_quant (line 244) | def flashinfer_fused_allreduce_rmsnorm_fp4_quant(
function standard_allreduce_rmsnorm (line 287) | def standard_allreduce_rmsnorm(
function standard_allreduce_rmsnorm_fp8_quant (line 316) | def standard_allreduce_rmsnorm_fp8_quant(
function standard_allreduce_rmsnorm_fp4_quant (line 353) | def standard_allreduce_rmsnorm_fp4_quant(
function standard_allreduce_rmsnorm_native (line 397) | def standard_allreduce_rmsnorm_native(
function standard_allreduce_rmsnorm_fp8_quant_native (line 415) | def standard_allreduce_rmsnorm_fp8_quant_native(
function standard_allreduce_rmsnorm_fp4_quant_native (line 443) | def standard_allreduce_rmsnorm_fp4_quant_native(
function standard_allreduce_rmsnorm_native_compiled (line 478) | def standard_allreduce_rmsnorm_native_compiled(
function standard_allreduce_rmsnorm_fp8_quant_native_compiled (line 491) | def standard_allreduce_rmsnorm_fp8_quant_native_compiled(
function standard_allreduce_rmsnorm_fp4_quant_native_compiled (line 511) | def standard_allreduce_rmsnorm_fp4_quant_native_compiled(
function create_test_tensors (line 532) | def create_test_tensors(
function benchmark_operation (line 566) | def benchmark_operation(
function run_benchmarks (line 605) | def run_benchmarks(
function prepare_results_with_speedups (line 894) | def prepare_results_with_speedups(results_dict):
function print_results (line 993) | def print_results(results_dict, seq_len, hidden_dim, dtype, use_residual...
function format_results_markdown (line 1019) | def format_results_markdown(
function save_results_to_file (line 1064) | def save_results_to_file(
function main (line 1087) | def main():
FILE: benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py
function fused_moe_triton_api (line 31) | def fused_moe_triton_api(
function fused_moe_sglang_api (line 61) | def fused_moe_sglang_api(
function benchmark (line 115) | def benchmark(
function main (line 190) | def main():
FILE: benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
function get_model_config (line 16) | def get_model_config(model_name: str, tp_size: int):
function fused_topk_native (line 77) | def fused_topk_native(
function fused_moe_torch (line 97) | def fused_moe_torch(
function fused_moe_torch_compile (line 127) | def fused_moe_torch_compile(
function fused_moe_sglang_api (line 153) | def fused_moe_sglang_api(
function benchmark (line 203) | def benchmark(batch_size, provider, model_config, use_fp8_w8a8=False):
function main (line 281) | def main():
FILE: benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
function fused_moe_vllm_api (line 22) | def fused_moe_vllm_api(
function fused_moe_sglang_api (line 68) | def fused_moe_sglang_api(
function benchmark (line 120) | def benchmark(batch_size, provider, model_config, use_fp8_w8a8=False):
function main (line 214) | def main():
FILE: benchmark/kernels/fused_moe_triton/common_utils.py
class BenchmarkConfig (line 14) | class BenchmarkConfig(TypedDict):
function calculate_shard_intermediate_size (line 23) | def calculate_shard_intermediate_size(
function get_model_config (line 32) | def get_model_config(
function get_rocm_configs_compute_bound (line 161) | def get_rocm_configs_compute_bound() -> List[Dict[str, int]]:
function get_configs_compute_bound (line 184) | def get_configs_compute_bound() -> List[Dict[str, int]]:
function sort_config (line 208) | def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
function save_configs (line 223) | def save_configs(
function get_config_filename (line 233) | def get_config_filename(
function get_default_batch_sizes (line 271) | def get_default_batch_sizes() -> List[int]:
FILE: benchmark/kernels/fused_moe_triton/tuning_client.py
function read_long_prompt (line 21) | def read_long_prompt():
function openai_stream_test (line 31) | def openai_stream_test(model, ip, port):
FILE: benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
function benchmark_config (line 40) | def benchmark_config(
class BenchmarkWorker (line 236) | class BenchmarkWorker:
method __init__ (line 238) | def __init__(self, seed: int, server_args: ServerArgs) -> None:
method benchmark (line 247) | def benchmark(
method tune (line 315) | def tune(
function main (line 365) | def main(args: argparse.Namespace):
FILE: benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton_sep.py
class MoeInputs (line 45) | class MoeInputs:
class KernelWrapper (line 52) | class KernelWrapper:
method __init__ (line 53) | def __init__(self, moe_inputs, use_cuda_graph=True, inner_iter=10, **k...
method cuda_graph_wrapper (line 64) | def cuda_graph_wrapper(self):
method forward_cost (line 95) | def forward_cost(self, try_cnt=2):
function load_topk_ids (line 119) | def load_topk_ids(topk_ids_dir, i: int):
function benchmark_config (line 128) | def benchmark_config(
class BestConfigTrace (line 408) | class BestConfigTrace:
method __init__ (line 409) | def __init__(self, name, down_moe=False):
method update (line 414) | def update(self, config, time_cost_all):
method time_cost (line 426) | def time_cost(self, block_m):
method config_dict (line 432) | def config_dict(self, block_m):
class BenchmarkWorker (line 445) | class BenchmarkWorker:
method __init__ (line 447) | def __init__(self, seed: int, server_args: ServerArgs) -> None:
method benchmark (line 456) | def benchmark(
method tune (line 494) | def tune(
method cmp_configs (line 563) | def cmp_configs(
function save_configs_sep (line 621) | def save_configs_sep(
function main (line 659) | def main(args: argparse.Namespace):
FILE: benchmark/kernels/quantization/bench_fp4_quant.py
function _test_accuracy_once (line 17) | def _test_accuracy_once(E, M, K, input_dtype, device):
function benchmark (line 53) | def benchmark(M, K, provider):
function test_accuracy (line 114) | def test_accuracy():
FILE: benchmark/kernels/quantization/bench_int8_quant.py
function torch_int8_quant (line 12) | def torch_int8_quant(x):
function _test_accuracy_once (line 23) | def _test_accuracy_once(M, K, input_dtype, device):
function test_accuracy (line 35) | def test_accuracy():
function benchmark (line 59) | def benchmark(batch_size, provider):
FILE: benchmark/kernels/quantization/tuning_block_wise_kernel.py
function w8a8_block_matmul (line 46) | def w8a8_block_matmul(
function get_rocm_configs_compute_bound (line 138) | def get_rocm_configs_compute_bound():
function get_configs_compute_bound (line 161) | def get_configs_compute_bound():
function get_weight_shapes (line 185) | def get_weight_shapes(tp_size):
function benchmark_config (line 218) | def benchmark_config(
function tune (line 245) | def tune(M, N, K, block_size, out_dtype, search_space, input_type):
function save_configs (line 318) | def save_configs(
function get_available_gpu_count (line 354) | def get_available_gpu_count():
function tune_on_gpu (line 359) | def tune_on_gpu(args_dict):
function distribute_batch_sizes (line 405) | def distribute_batch_sizes(batch_sizes, num_gpus):
function main (line 415) | def main(args):
FILE: benchmark/kernels/scheduler_batch/benchmark_get_last_loc_triton.py
function get_last_loc_torch (line 11) | def get_last_loc_torch(
function get_last_loc_kernel (line 24) | def get_last_loc_kernel(
function get_last_loc_triton (line 47) | def get_last_loc_triton(
function test_get_last_loc (line 69) | def test_get_last_loc():
function get_benchmark (line 94) | def get_benchmark():
function run_benchmark (line 144) | def run_benchmark(save_path: str = "./configs/benchmark_ops/get_last_loc...
FILE: benchmark/kernels/scheduler_batch/benchmark_write_req_to_token_pool_triton.py
function write_req_to_token_pool_triton (line 12) | def write_req_to_token_pool_triton(
function write_req_to_token_pool_triton_optimize (line 49) | def write_req_to_token_pool_triton_optimize(
function write_req_to_token_pool_reference (line 91) | def write_req_to_token_pool_reference(
function test_write_req_to_token_pool (line 114) | def test_write_req_to_token_pool():
function get_benchmark (line 231) | def get_benchmark():
function run_benchmark (line 315) | def run_benchmark(save_path: str = "./configs/benchmark_ops/write_req_to...
FILE: benchmark/kernels/sliding_window_attention_triton/bench_triton_swa_kernel.py
function extend_attention_fwd_torch (line 11) | def extend_attention_fwd_torch(
function _build_batch (line 85) | def _build_batch(
function _run_triton (line 177) | def _run_triton(inputs):
function _run_torch_ref (line 196) | def _run_torch_ref(inputs):
function bench (line 241) | def bench(
FILE: benchmark/line_retrieval/bench_sglang.py
function line_retrieval (line 17) | def line_retrieval(s, prefix, suffix, body_0, body_1, body_2, body_3):
function eval_model (line 30) | def eval_model(args, line_obj, num_hoops, src_indices, dst_percents):
function main (line 131) | def main(args):
FILE: benchmark/line_retrieval/gen_data.py
function generate_lines (line 16) | def generate_lines(random_words, num_lines, redirect_ratio):
FILE: benchmark/llava_bench/bench_sglang.py
function image_qa (line 17) | def image_qa(s, image_file, question):
function main (line 22) | def main(args):
FILE: benchmark/llm_judge/bench_other.py
function multi_dimension_judge (line 24) | def multi_dimension_judge(article, generate):
function multi_dimension_judge_async (line 52) | async def multi_dimension_judge_async(article, generate):
function main (line 80) | def main(args):
FILE: benchmark/llm_judge/bench_sglang.py
function multi_dimension_judge (line 25) | def multi_dimension_judge(s, article):
function main (line 54) | def main(args):
FILE: benchmark/long_json_decode/bench_other.py
function json_decode (line 13) | def json_decode(document, generate):
function main (line 30) | def main(args):
FILE: benchmark/long_json_decode/bench_sglang.py
function json_decode (line 14) | def json_decode(s, document):
function main (line 34) | def main(args):
FILE: benchmark/lora/launch_server.py
function launch_server (line 11) | def launch_server(args):
FILE: benchmark/lora/lora_bench.py
function async_request_openai_completions (line 46) | async def async_request_openai_completions(
function benchmark (line 147) | async def benchmark(
function run_benchmark (line 343) | def run_benchmark(args_: argparse.Namespace):
function set_ulimit (line 401) | def set_ulimit(target_soft_limit=65535):
FILE: benchmark/mmlu/bench_other.py
function format_subject (line 20) | def format_subject(subject):
function format_example (line 28) | def format_example(df, idx, include_answer=True):
function gen_prompt (line 39) | def gen_prompt(train_df, subject, k=-1):
function evaluate (line 50) | def evaluate(args, subject, dev_df, test_df, call_generate):
function main (line 115) | def main(args):
FILE: benchmark/mmlu/bench_sglang.py
function format_subject (line 25) | def format_subject(subject):
function format_example (line 33) | def format_example(df, idx, include_answer=True):
function gen_prompt (line 44) | def gen_prompt(train_df, subject, k=-1):
function download_data (line 55) | def download_data(data_dir):
function main (line 77) | def main(args):
FILE: benchmark/mmmu/bench_hf.py
function eval_mmmu (line 18) | def eval_mmmu(args):
FILE: benchmark/mmmu/bench_sglang.py
class RequestFuncOutput (line 42) | class RequestFuncOutput:
function async_request_profile (line 54) | async def async_request_profile(api_url: str) -> RequestFuncOutput:
function _get_prefix_suffix (line 72) | def _get_prefix_suffix(prompt: str) -> Tuple[str, str]:
function process_sample (line 79) | async def process_sample(
function process_sample_with_semaphore (line 127) | async def process_sample_with_semaphore(
function eval_mmmu (line 143) | async def eval_mmmu(args) -> None:
function parse_args (line 236) | def parse_args():
function main (line 249) | def main():
FILE: benchmark/mmmu/data_utils.py
function get_multi_choice_info (line 78) | def get_multi_choice_info(options):
function load_yaml (line 94) | def load_yaml(file_path):
function parse_img_path (line 104) | def parse_img_path(text):
function process_single_sample (line 109) | def process_single_sample(data):
function save_json (line 138) | def save_json(filename, ds):
function save_jsonl (line 145) | def save_jsonl(filename, data):
function save_args (line 163) | def save_args(args, path_dir):
function construct_prompt (line 173) | def construct_prompt(sample, config):
FILE: benchmark/mmmu/eval_utils.py
class EvalArgs (line 28) | class EvalArgs:
method add_cli_args (line 46) | def add_cli_args(parser: argparse.ArgumentParser):
method from_cli_args (line 133) | def from_cli_args(cls, args: argparse.Namespace):
function set_seed (line 138) | def set_seed(seed_value):
function prepare_samples (line 154) | def prepare_samples(eval_args: EvalArgs):
function get_sampling_params (line 258) | def get_sampling_params(eval_args):
function parse_multi_choice_response (line 276) | def parse_multi_choice_response(response, all_choices, index2ans):
function check_is_number (line 332) | def check_is_number(string):
function normalize_str (line 344) | def normalize_str(string):
function extract_numbers (line 369) | def extract_numbers(string):
function parse_open_response (line 392) | def parse_open_response(response):
function eval_multi_choice (line 468) | def eval_multi_choice(gold_i, pred_i):
function eval_open (line 497) | def eval_open(gold_i, pred_i):
function evaluate (line 526) | def evaluate(samples):
function calculate_ins_level_acc (line 553) | def calculate_ins_level_acc(results: Dict):
function process_result (line 565) | def process_result(response, sample, answer_dict, out_samples):
function eval_result (line 589) | def eval_result(model_answer_path, answer_dict, eval_output_path=None):
FILE: benchmark/mtbench/bench_other.py
function load_questions (line 15) | def load_questions(filename):
function write_answers (line 24) | def write_answers(filename, model_id, questions, answers):
function main (line 40) | def main(args):
FILE: benchmark/mtbench/bench_sglang.py
function load_questions (line 15) | def load_questions(filename):
function write_answers (line 24) | def write_answers(filename, model_id, questions, answers):
function answer_mt_bench (line 41) | def answer_mt_bench(s, question_1, question_2):
function main (line 49) | def main(args):
FILE: benchmark/mtbench/bench_sglang_eagle.py
function load_questions (line 24) | def load_questions(filename):
function write_answers (line 33) | def write_answers(filename, model_id, questions, answers):
function answer_mt_bench (line 50) | def answer_mt_bench(s, question_1, question_2):
function main (line 60) | def main(args):
FILE: benchmark/multi_chain_reasoning/bench_other.py
function get_answer_value (line 18) | def get_answer_value(answer_str):
function multi_chain_gsm8k (line 39) | def multi_chain_gsm8k(question, num_chains, call_generate):
function multi_chain_gsm8k_async (line 65) | async def multi_chain_gsm8k_async(question, num_chains, call_generate):
function main (line 91) | def main(args):
FILE: benchmark/multi_chain_reasoning/bench_sglang.py
function get_answer_value (line 18) | def get_answer_value(answer_str):
function main (line 39) | def main(args):
FILE: benchmark/multi_document_qa/bench_other.py
function multi_document_qa (line 18) | def multi_document_qa(docs, question, generate):
function main (line 37) | def main(args):
FILE: benchmark/multi_document_qa/bench_sglang.py
function multi_document_qa (line 14) | def multi_document_qa(s, docs, question):
function main (line 33) | def main(args):
FILE: benchmark/multi_turn_chat/bench_other.py
function multi_turns (line 15) | def multi_turns(generate, qas):
function main (line 24) | def main(args):
FILE: benchmark/multi_turn_chat/bench_sglang.py
function multi_turns (line 17) | def multi_turns(s, qas):
function main (line 23) | def main(args):
FILE: benchmark/multi_turn_chat/data_gen.py
function gen_prompt (line 7) | def gen_prompt(tokenizer, token_num):
function gen_arguments (line 15) | def gen_arguments(args, tokenizer):
FILE: benchmark/multi_turn_chat/long_prompt_multi_turn.py
function gen_prompt (line 18) | def gen_prompt(tokenizer, token_num):
function get_cache_path (line 25) | def get_cache_path(args):
function gen_arguments (line 34) | def gen_arguments(args, tokenizer):
function multi_turns (line 72) | def multi_turns(s, system_prompt, qas):
function main (line 80) | def main(args):
FILE: benchmark/prefill_only/bench_embeddings.py
function build_embeddings_request (line 81) | def build_embeddings_request(index: int, item_count: int) -> tuple:
function validate_embeddings_response (line 100) | def validate_embeddings_response(response_data: dict) -> bool:
function build_warmup_embeddings_request (line 110) | def build_warmup_embeddings_request() -> dict:
function run_benchmark (line 122) | async def run_benchmark(rps, duration_secs, item_count):
function main (line 137) | async def main():
FILE: benchmark/prefill_only/bench_score.py
function create_score_request_builder (line 65) | def create_score_request_builder():
function validate_score_response (line 115) | def validate_score_response(response_data: dict) -> bool:
function build_warmup_score_request (line 120) | def build_warmup_score_request() -> dict:
function run_benchmark (line 155) | async def run_benchmark(rps, duration_secs, item_count):
function main (line 173) | async def main():
FILE: benchmark/prefill_only/util.py
class BenchmarkConfig (line 22) | class BenchmarkConfig:
method __init__ (line 25) | def __init__(self):
function generate_text_with_token_count (line 46) | def generate_text_with_token_count(
function setup_profiler (line 87) | def setup_profiler(config: BenchmarkConfig, benchmark_name: str) -> None:
function prepare_all_requests_parallel (line 106) | def prepare_all_requests_parallel(
function sleep_with_distribution (line 187) | async def sleep_with_distribution(distribution: str, rps: float) -> None:
function build_http_request_json (line 208) | def build_http_request_json(request_data: Any) -> str:
function make_http_call (line 221) | async def make_http_call(
function send_profile_request (line 284) | async def send_profile_request(
function call_freeze_gc_http (line 331) | async def call_freeze_gc_http(session: aiohttp.ClientSession, http_url: ...
function send_warmup_requests (line 359) | async def send_warmup_requests(
function perform_global_warmup_and_freeze (line 398) | async def perform_global_warmup_and_freeze(
function process_results (line 430) | async def process_results(
function print_csv_results (line 573) | def print_csv_results(all_results: List[Dict[str, Any]]) -> None:
function run_benchmark_main (line 628) | async def run_benchmark_main(
function run_generic_benchmark (line 696) | async def run_generic_benchmark(
FILE: benchmark/react/bench_other.py
function get_prompt (line 12) | def get_prompt(question):
function main (line 86) | def main(args):
FILE: benchmark/react/bench_sglang.py
function webthink (line 14) | def webthink(s, question, triplets):
function main (line 109) | def main(args):
FILE: benchmark/reasoning_benchmark/answer_extraction.py
function _fix_fracs (line 8) | def _fix_fracs(string):
function _fix_a_slash_b (line 40) | def _fix_a_slash_b(string):
function _fix_sqrt (line 57) | def _fix_sqrt(string):
function _fix_tan (line 63) | def _fix_tan(string):
function strip_string (line 69) | def strip_string(string):
function extract_boxed_answers (line 182) | def extract_boxed_answers(text):
function extract_program_output (line 200) | def extract_program_output(pred_str):
function extract_answer (line 214) | def extract_answer(pred_str, exhaust=False):
function extract_math_answer (line 253) | def extract_math_answer(question, reasoning, task):
FILE: benchmark/reasoning_benchmark/bench_sglang.py
function reasoning_gen (line 19) | def reasoning_gen(s, question: str):
function convert_dataset (line 31) | def convert_dataset(path: str, question_key: str, answer_key: str, num_t...
function main (line 44) | def main(args):
FILE: benchmark/reasoning_benchmark/eval_utils.py
function parse_digits (line 11) | def parse_digits(num):
function is_digit (line 28) | def is_digit(num):
function symbolic_equal (line 33) | def symbolic_equal(a, b):
function math_equal (line 59) | def math_equal(prediction, reference, include_percentage=True, is_close=...
FILE: benchmark/tip_suggestion/bench_other.py
function expand_tip (line 15) | def expand_tip(topic, tip, generate):
function suggest_tips (line 34) | def suggest_tips(topic, generate):
function main (line 57) | def main(args):
FILE: benchmark/tip_suggestion/bench_sglang.py
function expand_tip (line 16) | def expand_tip(s, topic, tip):
function suggest_tips (line 36) | def suggest_tips(s, topic):
function main (line 54) | def main(args):
FILE: benchmark/tip_suggestion/lmql_funcs.py
function expand_tip_async (line 4) | async def expand_tip_async(topic, tip, generate):
function suggest_tips_async (line 23) | async def suggest_tips_async(topic, generate):
FILE: benchmark/tree_of_thought_deep/bench_other.py
function get_answer_value (line 18) | def get_answer_value(answer_str):
function most_frequent_number (line 29) | def most_frequent_number(numbers):
function propose_plan (line 47) | def propose_plan(s, question, num_branches, call_generate):
function execute_plan (line 62) | def execute_plan(s, num_branches, call_generate):
function reflect_solution (line 75) | def reflect_solution(s, num_branches, call_generate):
function get_final_answer (line 88) | def get_final_answer(s, num_branches, call_generate):
function tree_search (line 101) | def tree_search(question, num_branches, call_generate):
function main (line 122) | def main(args):
FILE: benchmark/tree_of_thought_deep/bench_sglang.py
function get_answer_value (line 20) | def get_answer_value(answer_str):
function most_frequent_number (line 31) | def most_frequent_number(numbers):
function propose_plan (line 44) | def propose_plan(s, question, num_branches):
function execute_plan (line 54) | def execute_plan(s, num_branches):
function reflect_solution (line 63) | def reflect_solution(s, num_branches):
function get_final_answer (line 72) | def get_final_answer(s, num_branches):
function tree_search (line 82) | def tree_search(s, question, num_branches):
function main (line 104) | def main(args):
FILE: benchmark/tree_of_thought_deep/lmql_funcs.py
function propose_plan_async (line 10) | async def propose_plan_async(s, question, num_branches, call_generate):
function execute_plan_async (line 25) | async def execute_plan_async(s, num_branches, call_generate):
function reflect_solution_async (line 38) | async def reflect_solution_async(s, num_branches, call_generate):
function get_final_answer_async (line 51) | async def get_final_answer_async(s, num_branches, call_generate):
function tree_search_async (line 64) | async def tree_search_async(question, num_branches, call_generate):
FILE: benchmark/tree_of_thought_v0/bench_other.py
function get_answer_value (line 18) | def get_answer_value(answer_str):
function most_frequent_number (line 29) | def most_frequent_number(numbers):
function propose_plan (line 47) | def propose_plan(s, question, num_branches, call_generate):
function execute_plan (line 62) | def execute_plan(s, num_branches, call_generate):
function reflect_solution (line 75) | def reflect_solution(s, num_branches, call_generate):
function tree_search (line 88) | def tree_search(question, num_branches, call_generate):
function main (line 102) | def main(args):
FILE: benchmark/tree_of_thought_v0/bench_sglang.py
function get_answer_value (line 20) | def get_answer_value(answer_str):
function most_frequent_number (line 31) | def most_frequent_number(numbers):
function propose_plan (line 44) | def propose_plan(s, question, num_branches):
function execute_plan (line 54) | def execute_plan(s, num_branches):
function reflect_solution (line 63) | def reflect_solution(s, num_branches):
function tree_search (line 73) | def tree_search(s, question, num_branches):
function main (line 93) | def main(args):
FILE: docs/conf.py
function setup (line 136) | def setup(app):
FILE: docs/deploy.py
function run_cmd (line 7) | def run_cmd(cmd):
FILE: docs/performance_dashboard/app.js
constant GITHUB_REPO (line 3) | const GITHUB_REPO = 'sgl-project/sglang';
constant WORKFLOW_NAME (line 4) | const WORKFLOW_NAME = 'nightly-test-nvidia.yml';
constant ARTIFACT_PREFIX (line 5) | const ARTIFACT_PREFIX = 'consolidated-metrics-';
function init (line 40) | async function init() {
function loadData (line 58) | async function loadData() {
function fetchWorkflowRuns (line 93) | async function fetchWorkflowRuns() {
function fetchMetricsForRun (line 112) | async function fetchMetricsForRun(run) {
function isDiffusionResult (line 151) | function isDiffusionResult(result) {
function populateFilters (line 156) | function populateFilters() {
function formatIoLenLabel (line 227) | function formatIoLenLabel(ioKey) {
function sortIoLengths (line 237) | function sortIoLengths(ioLengths) {
function populateSelectWithLabels (line 247) | function populateSelectWithLabels(selectId, options, labelFormatter) {
function updateIoLenFilter (line 258) | function updateIoLenFilter() {
function updateVariantFilter (line 308) | function updateVariantFilter() {
function populateSelect (line 344) | function populateSelect(selectId, options) {
function populateSelectNoAll (line 354) | function populateSelectNoAll(selectId, options) {
function createMetricTabs (line 368) | function createMetricTabs() {
function detectCurrentDataType (line 396) | function detectCurrentDataType() {
function selectMetricTab (line 416) | function selectMetricTab(metricKey, tabElement) {
function handleModelFilterChange (line 429) | function handleModelFilterChange(model) {
function handleGpuFilterChange (line 441) | function handleGpuFilterChange() {
function updateStats (line 452) | function updateStats() {
function updateCharts (line 498) | function updateCharts() {
function prepareChartData (line 512) | function prepareChartData(gpuFilter, modelFilter, variantFilter, ioLenFi...
function prepareChartDataByBatch (line 581) | function prepareChartDataByBatch(gpuFilter, modelFilter, variantFilter, ...
function updateMetricChart (line 741) | function updateMetricChart(chartDataByBatch, metricType) {
function getChartOptions (line 826) | function getChartOptions(yAxisLabel) {
function escapeHtml (line 880) | function escapeHtml(text) {
function updateRunsTable (line 887) | function updateRunsTable() {
function refreshData (line 937) | async function refreshData() {
function formatNumber (line 944) | function formatNumber(num) {
function getAuthHeaders (line 955) | function getAuthHeaders() {
function checkAuthAndInit (line 964) | async function checkAuthAndInit() {
function handleLogin (line 1014) | async function handleLogin(event) {
FILE: docs/performance_dashboard/fetch_metrics.py
function get_github_token (line 31) | def get_github_token() -> Optional[str]:
function get_headers (line 55) | def get_headers(token: Optional[str]) -> dict:
function fetch_workflow_runs (line 65) | def fetch_workflow_runs(
function fetch_run_artifacts (line 97) | def fetch_run_artifacts(token: Optional[str], run_id: int) -> list:
function download_artifact (line 107) | def download_artifact(token: Optional[str], artifact_id: int) -> Optiona...
function extract_metrics_from_zip (line 128) | def extract_metrics_from_zip(zip_content: bytes) -> Optional[dict]:
function fetch_metrics_for_run (line 144) | def fetch_metrics_for_run(token: Optional[str], run: dict) -> Optional[d...
function fetch_single_run (line 184) | def fetch_single_run(token: Optional[str], run_id: int) -> Optional[dict]:
function main (line 195) | def main():
FILE: docs/performance_dashboard/server.py
function hash_password (line 64) | def hash_password(password):
function create_auth_token (line 69) | def create_auth_token():
function verify_auth_token (line 82) | def verify_auth_token(token):
function get_github_token (line 95) | def get_github_token():
function fetch_metrics_from_github (line 117) | def fetch_metrics_from_github(days=30):
function update_cache_async (line 219) | def update_cache_async():
function start_periodic_refresh (line 237) | def start_periodic_refresh(interval_hours):
class DashboardHandler (line 252) | class DashboardHandler(http.server.SimpleHTTPRequestHandler):
method __init__ (line 255) | def __init__(self, *args, directory=None, **kwargs):
method _send_json (line 258) | def _send_json(self, data, status=200):
method _check_auth (line 266) | def _check_auth(self):
method do_GET (line 278) | def do_GET(self):
method do_POST (line 297) | def do_POST(self):
method handle_auth_check (line 305) | def handle_auth_check(self):
method handle_login (line 309) | def handle_login(self):
method handle_metrics_api (line 335) | def handle_metrics_api(self, parsed):
method handle_refresh_api (line 351) | def handle_refresh_api(self):
method log_message (line 356) | def log_message(self, format, *args):
function main (line 361) | def main():
FILE: docs/release_lookup/generate_index.py
function run_git (line 14) | def run_git(cmd):
function is_stable_release (line 23) | def is_stable_release(tag_name):
function get_tags (line 31) | def get_tags():
function extract_pr_num (line 60) | def extract_pr_num(message):
function process_tag_line (line 75) | def process_tag_line(tags, commit_map, pr_map, tag_type, tag_to_idx):
function main (line 129) | def main():
FILE: docs/wrap_run_llm.py
function insert_runllm_widget (line 5) | def insert_runllm_widget(html_content):
function process_html_files (line 16) | def process_html_files(build_dir):
function main (line 34) | def main():
FILE: examples/checkpoint_engine/update.py
function timer (line 29) | def timer(msg: str):
function check_sglang_ready (line 36) | def check_sglang_ready(
function split_checkpoint_files (line 60) | def split_checkpoint_files(
function split_tensors (line 73) | def split_tensors(
function req_inference (line 94) | def req_inference(
function update_weights (line 123) | def update_weights(
function join (line 161) | def join(
FILE: examples/frontend_language/quick_start/anthropic_example_chat.py
function multi_turn_question (line 11) | def multi_turn_question(s, question_1, question_2):
function single (line 18) | def single():
function stream (line 30) | def stream():
function batch (line 42) | def batch():
FILE: examples/frontend_language/quick_start/anthropic_example_complete.py
function few_shot_qa (line 11) | def few_shot_qa(s, question):
function single (line 24) | def single():
function stream (line 33) | def stream():
function batch (line 43) | def batch():
FILE: examples/frontend_language/quick_start/azure_openai_example_chat.py
function multi_turn_question (line 13) | def multi_turn_question(s, question_1, question_2):
function single (line 21) | def single():
function stream (line 33) | def stream():
function batch (line 45) | def batch():
FILE: examples/frontend_language/quick_start/gemini_example_chat.py
function multi_turn_question (line 11) | def multi_turn_question(s, question_1, question_2):
function single (line 18) | def single():
function stream (line 30) | def stream():
function batch (line 42) | def batch():
FILE: examples/frontend_language/quick_start/gemini_example_complete.py
function few_shot_qa (line 11) | def few_shot_qa(s, question):
function single (line 24) | def single():
function stream (line 33) | def stream():
function batch (line 43) | def batch():
FILE: examples/frontend_language/quick_start/gemini_example_multimodal_chat.py
function image_qa (line 11) | def image_qa(s, image_file1, image_file2, question):
FILE: examples/frontend_language/quick_start/local_example_chat.py
function multi_turn_question (line 10) | def multi_turn_question(s, question_1, question_2):
function single (line 17) | def single():
function stream (line 29) | def stream():
function batch (line 41) | def batch():
FILE: examples/frontend_language/quick_start/local_example_complete.py
function few_shot_qa (line 10) | def few_shot_qa(s, question):
function single (line 23) | def single():
function stream (line 32) | def stream():
function batch (line 42) | def batch():
FILE: examples/frontend_language/quick_start/local_example_llava_next.py
function image_qa (line 10) | def image_qa(s, image_path, question):
function single (line 15) | def single():
function stream (line 22) | def stream():
function batch (line 35) | def batch():
FILE: examples/frontend_language/quick_start/openai_example_chat.py
function multi_turn_question (line 11) | def multi_turn_question(s, question_1, question_2):
function single (line 19) | def single():
function stream (line 31) | def stream():
function batch (line 43) | def batch():
FILE: examples/frontend_language/quick_start/openai_example_complete.py
function few_shot_qa (line 11) | def few_shot_qa(s, question):
function single (line 24) | def single():
function stream (line 33) | def stream():
function batch (line 43) | def batch():
FILE: examples/frontend_language/quick_start/openai_example_n.py
function multi_turn_question (line 11) | def multi_turn_question(s, question_1, question_2):
function single (line 24) | def single():
function batch (line 40) | def batch():
FILE: examples/frontend_language/quick_start/openai_example_o1.py
function multi_turn_question (line 11) | def multi_turn_question(s, question_1, question_2):
function single (line 19) | def single():
function batch (line 31) | def batch():
FILE: examples/frontend_language/quick_start/openrouter_example_chat.py
function multi_turn_question (line 13) | def multi_turn_question(s, question_1, question_2):
function single (line 21) | def single():
function stream (line 33) | def stream():
function batch (line 45) | def batch():
FILE: examples/frontend_language/quick_start/together_example_chat.py
function multi_turn_question (line 13) | def multi_turn_question(s, question_1, question_2):
function single (line 21) | def single():
function stream (line 33) | def stream():
function batch (line 45) | def batch():
FILE: examples/frontend_language/quick_start/together_example_complete.py
function few_shot_qa (line 13) | def few_shot_qa(s, question):
function single (line 26) | def single():
function stream (line 35) | def stream():
function batch (line 45) | def batch():
FILE: examples/frontend_language/usage/chinese_regex.py
function character_gen (line 22) | def character_gen(s, name):
function main (line 45) | def main():
FILE: examples/frontend_language/usage/choices_logprob.py
function tool_use (line 11) | def tool_use(s, question):
function main (line 16) | def main():
FILE: examples/frontend_language/usage/cot_decoding.py
function cot_decoding (line 13) | def cot_decoding(s, question, get_top_k, is_chat_model, verbose):
FILE: examples/frontend_language/usage/json_decode.py
function character_gen (line 33) | def character_gen(s, name):
function driver_character_gen (line 44) | def driver_character_gen():
class Weapon (line 49) | class Weapon(str, Enum):
class Wizard (line 58) | class Wizard(BaseModel):
function pydantic_wizard_gen (line 65) | def pydantic_wizard_gen(s):
function driver_pydantic_wizard_gen (line 75) | def driver_pydantic_wizard_gen():
FILE: examples/frontend_language/usage/json_logprobs.py
function openai_api_request (line 15) | def openai_api_request(name):
function srt_api_request (line 38) | def srt_api_request(name):
function pretty_print (line 70) | def pretty_print(res):
FILE: examples/frontend_language/usage/llava_video/srt_example_llava_v.py
function video_qa (line 20) | def video_qa(s, num_frames, video_path, question):
function single (line 25) | def single(path, num_frames=16):
function split_into_chunks (line 36) | def split_into_chunks(lst, num_chunks):
function save_batch_results (line 50) | def save_batch_results(batch_video_files, states, cur_chunk, batch_idx, ...
function compile_and_cleanup_final_results (line 60) | def compile_and_cleanup_final_results(cur_chunk, num_batches, save_dir):
function find_video_files (line 75) | def find_video_files(video_dir):
function batch (line 90) | def batch(video_dir, save_dir, cur_chunk, num_chunks, num_frames=16, bat...
FILE: examples/frontend_language/usage/openai_chat_speculative.py
function gen_character_spec (line 24) | def gen_character_spec(s):
function gen_character_spec_no_few_shot (line 42) | def gen_character_spec_no_few_shot(s):
function gen_character_normal (line 55) | def gen_character_normal(s):
function multi_turn_question (line 62) | def multi_turn_question(s, question_1, question_2):
function test_spec_single_turn (line 80) | def test_spec_single_turn():
function test_inaccurate_spec_single_turn (line 93) | def test_inaccurate_spec_single_turn():
function test_normal_single_turn (line 103) | def test_normal_single_turn():
function test_spec_multi_turn (line 109) | def test_spec_multi_turn():
function test_spec_multi_turn_stream (line 122) | def test_spec_multi_turn_stream():
FILE: examples/frontend_language/usage/openai_speculative.py
function gen_character_spec (line 10) | def gen_character_spec(s):
function gen_character_no_spec (line 19) | def gen_character_no_spec(s):
function gen_character_spec_no_few_shot (line 28) | def gen_character_spec_no_few_shot(s):
FILE: examples/frontend_language/usage/parallel_sample.py
function parallel_sample (line 10) | def parallel_sample(s, question, n):
FILE: examples/frontend_language/usage/readme_examples.py
function tool_use (line 11) | def tool_use(s, question):
function tip_suggestion (line 26) | def tip_suggestion(s):
function regular_expression_gen (line 43) | def regular_expression_gen(s):
function text_qa (line 53) | def text_qa(s, question):
function driver_tool_use (line 58) | def driver_tool_use():
function driver_tip_suggestion (line 64) | def driver_tip_suggestion():
function driver_regex (line 70) | def driver_regex():
function driver_batching (line 76) | def driver_batching():
function driver_stream (line 91) | def driver_stream():
FILE: examples/frontend_language/usage/sgl_gen_min_tokens.py
function long_answer (line 12) | def long_answer(s):
function short_answer (line 18) | def short_answer(s):
FILE: examples/frontend_language/usage/streaming.py
function multi_turn_question (line 12) | def multi_turn_question(s, question_1, question_2):
function stream_a_variable (line 23) | def stream_a_variable():
function async_stream (line 35) | async def async_stream():
FILE: examples/frontend_language/usage/triton/models/character_generation/1/model.py
class Character (line 12) | class Character(BaseModel):
function character_gen (line 19) | def character_gen(s, name):
class TritonPythonModel (line 29) | class TritonPythonModel:
method initialize (line 30) | def initialize(self, args):
method execute (line 33) | def execute(self, requests):
FILE: examples/profiler/nsys_profile_tools/gputrc2graph.py
function load_engine_model (line 18) | def load_engine_model():
class GPUTrace2Graph (line 34) | class GPUTrace2Graph:
method __init__ (line 39) | def __init__(self):
method gen_nonoverlapped_sum_from_gputrace (line 45) | def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file):
method sum_non_overlapping_intervals (line 66) | def sum_non_overlapping_intervals(self, df):
method make_html (line 107) | def make_html(self, df, output_dir, title):
method anno_gpu_kernname (line 165) | def anno_gpu_kernname(self, df, mapping):
method make_nongpu_row (line 175) | def make_nongpu_row(self, df, nongpu_sec):
method is_valid_file (line 183) | def is_valid_file(self, base_file):
method should_gen_file (line 189) | def should_gen_file(self, new_file, base_file):
method gen_sum_file (line 203) | def gen_sum_file(self, file, nsys_cmd):
method gen_graph (line 250) | def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model):
function parse_tuple (line 290) | def parse_tuple(s):
function main (line 294) | def main():
FILE: examples/runtime/engine/custom_server.py
function generate (line 14) | async def generate(request):
function generate_stream (line 26) | async def generate_stream(request):
function run_server (line 46) | def run_server():
FILE: examples/runtime/engine/embedding.py
function main (line 4) | def main():
FILE: examples/runtime/engine/fastapi_engine_inference.py
function lifespan (line 26) | async def lifespan(app: FastAPI):
function generate_text (line 47) | async def generate_text(request: Request):
function start_server (line 78) | def start_server(args, timeout=60):
function send_requests (line 129) | def send_requests(server_url, prompts, max_new_tokens, temperature):
FILE: examples/runtime/engine/launch_engine.py
function main (line 8) | def main():
FILE: examples/runtime/engine/offline_batch_inference.py
function main (line 13) | def main(
FILE: examples/runtime/engine/offline_batch_inference_async.py
class InferenceEngine (line 19) | class InferenceEngine:
method __init__ (line 20) | def __init__(self, **kwargs):
method generate (line 23) | async def generate(self, prompt, sampling_params):
function run_server (line 28) | async def run_server(server_args):
FILE: examples/runtime/engine/offline_batch_inference_eagle.py
function main (line 4) | def main():
FILE: examples/runtime/engine/offline_batch_inference_qwen_1m.py
function load_prompt (line 11) | def load_prompt() -> str:
function process_requests (line 29) | def process_requests(llm: sgl.Engine, prompts: list[str]) -> None:
function initialize_engine (line 50) | def initialize_engine() -> sgl.Engine:
function main (line 67) | def main():
FILE: examples/runtime/engine/offline_batch_inference_vlm.py
function main (line 14) | def main(
FILE: examples/runtime/engine/save_remote_state.py
function main (line 45) | def main(args):
FILE: examples/runtime/engine/save_sharded_state.py
function main (line 50) | def main(args):
FILE: examples/runtime/hidden_states/hidden_states_engine.py
function main (line 15) | def main():
FILE: examples/runtime/hidden_states/hidden_states_server.py
function main (line 23) | def main():
FILE: examples/runtime/lora.py
function main (line 16) | def main():
FILE: examples/runtime/multimodal/llama3_llava_server.py
function send_request (line 27) | async def send_request(url, data, delay=0):
function test_concurrent (line 35) | async def test_concurrent(args):
function test_streaming (line 68) | def test_streaming(args):
FILE: examples/runtime/multimodal/llava_onevision_server.py
function download_video (line 27) | def download_video(url, cache_dir):
function create_openai_client (line 41) | def create_openai_client(base_url):
function image_stream_request_test (line 45) | def image_stream_request_test(client):
function multi_image_stream_request_test (line 82) | def multi_image_stream_request_test(client):
function video_stream_request_test (line 129) | def video_stream_request_test(client, video_path):
function image_speed_test (line 152) | def image_speed_test(client):
function video_speed_test (line 184) | def video_speed_test(client, video_path):
function prepare_video_messages (line 202) | def prepare_video_messages(video_path):
function print_speed_test_results (line 236) | def print_speed_test_results(request, start_time, end_time):
function main (line 250) | def main():
FILE: examples/runtime/multimodal/pixtral_server.py
function send_request (line 28) | async def send_request(url, data, delay=0):
function test_concurrent (line 36) | async def test_concurrent(args):
function test_streaming (line 73) | def test_streaming(args):
FILE: examples/runtime/multimodal/qwen_llava_server.py
function send_request (line 27) | async def send_request(url, data, delay=0):
function test_concurrent (line 35) | async def test_concurrent(args):
function test_streaming (line 68) | def test_streaming(args):
FILE: examples/runtime/qwen3_vl_reranker.py
function rerank_text_only (line 25) | def rerank_text_only():
function rerank_with_images (line 51) | def rerank_with_images():
function rerank_multimodal_query (line 106) | def rerank_multimodal_query():
function main (line 156) | def main():
FILE: examples/runtime/token_in_token_out/token_in_token_out_llm_engine.py
function main (line 11) | def main():
FILE: examples/runtime/token_in_token_out/token_in_token_out_llm_server.py
function main (line 23) | def main():
FILE: examples/runtime/token_in_token_out/token_in_token_out_vlm_engine.py
function get_input_ids (line 14) | def get_input_ids(
function token_in_out_example (line 37) | def token_in_out_example(
FILE: examples/runtime/token_in_token_out/token_in_token_out_vlm_server.py
function get_input_ids (line 26) | def get_input_ids() -> Tuple[list[int], list]:
function main (line 45) | def main():
FILE: examples/runtime/vertex_predict.py
class VertexPrediction (line 33) | class VertexPrediction:
class LocalVertexEndpoint (line 37) | class LocalVertexEndpoint:
method __init__ (line 38) | def __init__(self) -> None:
method predict (line 41) | def predict(self, instances: List[dict], parameters: Optional[dict] = ...
FILE: examples/usage/modelopt_quantize_and_export.py
function _validate_export (line 26) | def _validate_export(export_dir: str) -> bool:
function _get_export_info (line 56) | def _get_export_info(export_dir: str) -> Optional[dict]:
function quantize_and_export_model (line 78) | def quantize_and_export_model(
function deploy_exported_model (line 175) | def deploy_exported_model(
function main (line 224) | def main():
FILE: python/sglang/_mps_stub.py
class Stream (line 17) | class Stream:
method __init__ (line 24) | def __init__(self, device: Any = None, priority: int = 0) -> None:
method synchronize (line 27) | def synchronize(self) -> None:
method wait_stream (line 30) | def wait_stream(self, stream: Any) -> None:
method wait_event (line 33) | def wait_event(self, event: Any) -> None:
method record_event (line 36) | def record_event(self, event: Any = None) -> Any:
method query (line 39) | def query(self) -> bool:
method __enter__ (line 43) | def __enter__(self) -> "Stream":
method __exit__ (line 46) | def __exit__(self, *args: Any) -> None:
class Event (line 50) | class Event:
method __init__ (line 53) | def __init__(self, enable_timing: bool = False) -> None:
method record (line 56) | def record(self, stream: Any = None) -> None:
method wait (line 59) | def wait(self, stream: Any = None) -> None:
method query (line 62) | def query(self) -> bool:
method synchronize (line 65) | def synchronize(self) -> None:
method elapsed_time (line 68) | def elapsed_time(self, end_event: Any) -> float:
function current_stream (line 75) | def current_stream(device: Any = None) -> Stream:
function stream (line 80) | def stream(s: Any) -> Stream:
function set_device (line 85) | def set_device(device: Any) -> None: # noqa: ARG001
function current_device (line 90) | def current_device() -> int:
function device_count (line 95) | def device_count() -> int:
class _MPSDeviceProperties (line 101) | class _MPSDeviceProperties:
method __getattr__ (line 114) | def __getattr__(self, name: str) -> Any:
function get_device_properties (line 125) | def get_device_properties(device: Any = 0) -> _MPSDeviceProperties: # n...
class _MPSMemoryTracker (line 137) | class _MPSMemoryTracker:
method __init__ (line 145) | def __init__(self) -> None:
method memory_allocated (line 149) | def memory_allocated(self, device: Any = None) -> int: # noqa: ARG002
method memory_reserved (line 157) | def memory_reserved(self, device: Any = None) -> int: # noqa: ARG002
method max_memory_allocated (line 165) | def max_memory_allocated(self, device: Any = None) -> int: # noqa: AR...
method max_memory_reserved (line 169) | def max_memory_reserved(self, device: Any = None) -> int: # noqa: ARG002
method reset_peak_memory_stats (line 173) | def reset_peak_memory_stats(self, device: Any = None) -> None: # noqa...
function _patch_non_blocking (line 183) | def _patch_non_blocking() -> None:
function install (line 226) | def install() -> None:
FILE: python/sglang/_triton_stub.py
class _StubBase (line 19) | class _StubBase:
method __init_subclass__ (line 25) | def __init_subclass__(cls, **kwargs):
class _MockModule (line 29) | class _MockModule(types.ModuleType):
method __init__ (line 37) | def __init__(self, name: str):
method __getattr__ (line 48) | def __getattr__(self, name: str):
method __call__ (line 66) | def __call__(self, *args, **kwargs):
method __instancecheck__ (line 77) | def __instancecheck__(self, instance):
method __contains__ (line 81) | def __contains__(self, item):
method __iter__ (line 85) | def __iter__(self):
method __len__ (line 88) | def __len__(self):
method __bool__ (line 91) | def __bool__(self):
method __repr__ (line 94) | def __repr__(self):
function _cdiv (line 98) | def _cdiv(a: int, b: int) -> int:
function _next_power_of_2 (line 103) | def _next_power_of_2(n: int) -> int:
class _Config (line 108) | class _Config:
method __init__ (line 111) | def __init__(self, kwargs=None, num_warps=4, num_stages=2, **extra):
class _TritonFinder (line 117) | class _TritonFinder:
method find_module (line 128) | def find_module(self, fullname, path=None):
method load_module (line 133) | def load_module(self, fullname):
function _make_mock (line 148) | def _make_mock(name: str) -> _MockModule:
function install (line 155) | def install() -> None:
FILE: python/sglang/bench_offline_throughput.py
class BenchArgs (line 35) | class BenchArgs:
method add_cli_args (line 63) | def add_cli_args(parser: argparse.ArgumentParser):
method from_cli_args (line 201) | def from_cli_args(cls, args: argparse.Namespace):
function throughput_test_once (line 206) | def throughput_test_once(
function monitor_trace_file (line 294) | def monitor_trace_file(known_files, directory, interval=1):
function _create_ray_engine_backend (line 326) | def _create_ray_engine_backend(server_args: ServerArgs):
function throughput_test (line 394) | def throughput_test(
FILE: python/sglang/bench_one_batch.py
function start_profile (line 93) | def start_profile(profile_activities, profile_record_shapes=False, rank_...
function stop_profile (line 124) | def stop_profile(
class BenchArgs (line 158) | class BenchArgs:
method add_cli_args (line 178) | def add_cli_args(parser: argparse.ArgumentParser):
method from_cli_args (line 245) | def from_cli_args(cls, args: argparse.Namespace):
function load_model (line 259) | def load_model(server_args, port_args, gpu_id, tp_rank):
function prepare_inputs_for_correctness_test (line 289) | def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_pr...
function prepare_extend_inputs_for_correctness_test (line 334) | def prepare_extend_inputs_for_correctness_test(
function prepare_synthetic_inputs_for_latency_test (line 348) | def prepare_synthetic_inputs_for_latency_test(
class TreeCacheNamespace (line 377) | class TreeCacheNamespace(SimpleNamespace):
method supports_swa (line 378) | def supports_swa(self) -> bool:
method supports_mamba (line 381) | def supports_mamba(self) -> bool:
method is_chunk_cache (line 384) | def is_chunk_cache(self) -> bool:
method is_tree_cache (line 387) | def is_tree_cache(self) -> bool:
method evict (line 390) | def evict(self, params: EvictParams):
function extend (line 395) | def extend(reqs, model_runner):
function decode (line 422) | def decode(input_token_ids, batch, model_runner):
function _maybe_prepare_mlp_sync_batch (line 433) | def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
function _read_prompts_from_file (line 448) | def _read_prompts_from_file(prompt_file, rank_print):
function _get_torch_profiler_output_dir (line 461) | def _get_torch_profiler_output_dir():
function _create_torch_profiler_filename (line 465) | def _create_torch_profiler_filename(
function _save_profile_trace_results (line 473) | def _save_profile_trace_results(profiler, filename):
function correctness_test (line 484) | def correctness_test(
function synchronize (line 533) | def synchronize(device):
function latency_test_run_once (line 537) | def latency_test_run_once(
function latency_test (line 676) | def latency_test(
function main (line 791) | def main(server_args, bench_args):
FILE: python/sglang/bench_one_batch_server.py
function run_benchmark (line 25) | def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
FILE: python/sglang/bench_serving.py
function _get_bool_env_var (line 60) | def _get_bool_env_var(name: str, default: str = "false") -> bool:
function _create_bench_client_session (line 65) | def _create_bench_client_session():
class RequestFuncInput (line 79) | class RequestFuncInput:
class RequestFuncOutput (line 93) | class RequestFuncOutput:
method init_new (line 106) | def init_new(request_func_input: RequestFuncInput):
function get_auth_headers (line 112) | def get_auth_headers() -> Dict[str, str]:
function get_request_headers (line 123) | def get_request_headers() -> Dict[str, str]:
function wait_for_endpoint (line 130) | def wait_for_endpoint(url: str, timeout_sec: int = 60) -> bool:
function async_request_trt_llm (line 153) | async def async_request_trt_llm(
function async_request_openai_completions (line 224) | async def async_request_openai_completions(
function async_request_openai_chat_completions (line 338) | async def async_request_openai_chat_completions(
function async_request_truss (line 518) | async def async_request_truss(
function async_request_sglang_generate (line 598) | async def async_request_sglang_generate(
function async_request_openai_embeddings (line 704) | async def async_request_openai_embeddings(
function async_request_gserver (line 754) | async def async_request_gserver(
function async_request_profile (line 761) | async def async_request_profile(api_url: str) -> RequestFuncOutput:
function _build_profile_urls (line 816) | def _build_profile_urls(
function _call_profile_pd (line 835) | async def _call_profile_pd(profile_urls: List[Tuple[str, str]], mode: st...
class BenchmarkMetrics (line 875) | class BenchmarkMetrics:
function get_request (line 912) | async def get_request(
function calculate_metrics (line 952) | def calculate_metrics(
function wrap_multi_turn_request_func (line 1130) | def wrap_multi_turn_request_func(request_func: Callable, backend: str) -...
function benchmark (line 1163) | async def benchmark(
function check_chat_template (line 1638) | def check_chat_template(model_path):
function set_global_args (line 1647) | def set_global_args(args_: argparse.Namespace):
function run_benchmark (line 1653) | def run_benchmark(args_: argparse.Namespace):
class LoRAPathAction (line 1890) | class LoRAPathAction(argparse.Action):
method __call__ (line 1891) | def __call__(self, parser, namespace, values, option_string=None):
FILE: python/sglang/benchmark/bench_utils.py
function run_bench (line 7) | def run_bench(
FILE: python/sglang/benchmark/datasets/__init__.py
function get_dataset (line 30) | def get_dataset(args, tokenizer, model_id=None):
FILE: python/sglang/benchmark/datasets/common.py
class DatasetRow (line 22) | class DatasetRow:
method __post_init__ (line 33) | def __post_init__(self):
class BaseDataset (line 43) | class BaseDataset(ABC):
method from_args (line 46) | def from_args(cls, args: Namespace) -> "BaseDataset": ...
method load (line 49) | def load(
function compute_random_lens (line 56) | def compute_random_lens(full_len: int, range_ratio: float, num: int) -> ...
function get_available_tokens (line 68) | def get_available_tokens(tokenizer):
function gen_prompt (line 73) | def gen_prompt(tokenizer, token_num):
function gen_mm_prompt (line 80) | def gen_mm_prompt(tokenizer, image_pad_id, token_num):
FILE: python/sglang/benchmark/datasets/custom.py
class CustomDataset (line 20) | class CustomDataset(BaseDataset):
method from_args (line 29) | def from_args(cls, args: Namespace) -> "CustomDataset":
method load (line 40) | def load(
function sample_custom_requests (line 54) | def sample_custom_requests(
FILE: python/sglang/benchmark/datasets/generated_shared_prefix.py
class GeneratedSharedPrefixDataset (line 23) | class GeneratedSharedPrefixDataset(BaseDataset):
method from_args (line 37) | def from_args(cls, args: Namespace) -> "GeneratedSharedPrefixDataset":
method load (line 53) | def load(
function get_gen_prefix_cache_path (line 72) | def get_gen_prefix_cache_path(
function sample_generated_shared_prefix_requests (line 92) | def sample_generated_shared_prefix_requests(
FILE: python/sglang/benchmark/datasets/image.py
class ImageDataset (line 22) | class ImageDataset(BaseDataset):
method from_args (line 35) | def from_args(cls, args: Namespace) -> "ImageDataset":
method load (line 49) | def load(self, tokenizer=None, model_id=None) -> List[DatasetRow]:
function parse_image_resolution (line 66) | def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
function create_mm_data_row (line 96) | def create_mm_data_row(
function sample_image_requests (line 170) | def sample_image_requests(
FILE: python/sglang/benchmark/datasets/mmmu.py
class MMMUDataset (line 17) | class MMMUDataset(BaseDataset):
method from_args (line 23) | def from_args(cls, args: Namespace) -> "MMMUDataset":
method load (line 30) | def load(self, tokenizer=None, model_id=None) -> List[DatasetRow]:
function sample_mmmu_requests (line 40) | def sample_mmmu_requests(
FILE: python/sglang/benchmark/datasets/mooncake.py
class MooncakeDataset (line 20) | class MooncakeDataset(BaseDataset):
method from_args (line 26) | def from_args(cls, args: Namespace) -> "MooncakeDataset":
method load (line 33) | def load(self, tokenizer=None, model_id=None) -> List[Dict]:
function get_mooncake_request_over_time (line 50) | async def get_mooncake_request_over_time(
FILE: python/sglang/benchmark/datasets/openai_dataset.py
class OpenAIDataset (line 13) | class OpenAIDataset(BaseDataset):
method from_args (line 19) | def from_args(cls, args: Namespace) -> "OpenAIDataset":
method load (line 26) | def load(
function sample_openai_requests (line 37) | def sample_openai_requests(
FILE: python/sglang/benchmark/datasets/random.py
class RandomDataset (line 21) | class RandomDataset(BaseDataset):
method from_args (line 31) | def from_args(cls, args: Namespace) -> "RandomDataset":
method load (line 42) | def load(
function sample_random_requests (line 57) | def sample_random_requests(
FILE: python/sglang/benchmark/datasets/sharegpt.py
class ShareGPTDataset (line 25) | class ShareGPTDataset(BaseDataset):
method from_args (line 34) | def from_args(cls, args: Namespace) -> "ShareGPTDataset":
method load (line 45) | def load(
function sample_sharegpt_requests (line 59) | def sample_sharegpt_requests(
FILE: python/sglang/benchmark/utils.py
function remove_prefix (line 17) | def remove_prefix(text: str, prefix: str) -> str:
function remove_suffix (line 21) | def remove_suffix(text: str, suffix: str) -> str:
function parse_custom_headers (line 25) | def parse_custom_headers(header_list: List[str]) -> Dict[str, str]:
function get_model (line 29) | def get_model(pretrained_model_name_or_path: str) -> str:
function get_tokenizer (line 44) | def get_tokenizer(
function get_processor (line 67) | def get_processor(
function download_and_cache_hf_file (line 90) | def download_and_cache_hf_file(
function download_and_cache_file (line 101) | def download_and_cache_file(url: str, filename: Optional[str] = None):
function is_file_valid_json (line 135) | def is_file_valid_json(path):
function set_ulimit (line 151) | def set_ulimit(target_soft_limit=65535):
FILE: python/sglang/check_env.py
function is_cuda_v2 (line 16) | def is_cuda_v2():
class BaseEnv (line 56) | class BaseEnv:
method __init__ (line 59) | def __init__(self):
method get_info (line 63) | def get_info(self) -> dict:
method get_topology (line 70) | def get_topology(self) -> dict:
method get_package_versions (line 73) | def get_package_versions(self) -> dict:
method get_device_info (line 87) | def get_device_info(self):
method get_hypervisor_vendor (line 113) | def get_hypervisor_vendor(self) -> dict:
method get_ulimit_soft (line 123) | def get_ulimit_soft(self) -> dict:
method check_env (line 127) | def check_env(self):
class GPUEnv (line 144) | class GPUEnv(BaseEnv):
method get_info (line 147) | def get_info(self):
method _get_cuda_version_info (line 156) | def _get_cuda_version_info(self):
method _get_nvcc_info (line 170) | def _get_nvcc_info(self):
method _get_cuda_driver_version (line 193) | def _get_cuda_driver_version(self):
method get_topology (line 214) | def get_topology(self):
class HIPEnv (line 235) | class HIPEnv(BaseEnv):
method get_info (line 238) | def get_info(self):
method _get_cuda_version_info (line 247) | def _get_cuda_version_info(self):
method _get_hipcc_info (line 258) | def _get_hipcc_info(self):
method _get_rocm_driver_version (line 276) | def _get_rocm_driver_version(self):
method get_topology (line 294) | def get_topology(self):
class NPUEnv (line 310) | class NPUEnv(BaseEnv):
method __init__ (line 319) | def __init__(self):
method get_info (line 323) | def get_info(self):
method get_device_info (line 331) | def get_device_info(self):
method _get_cann_version_info (line 346) | def _get_cann_version_info(self):
method _get_cann_info (line 365) | def _get_cann_info(self, CANN_HOME: str):
method _get_ascend_driver_version (line 384) | def _get_ascend_driver_version(self):
method get_topology (line 407) | def get_topology(self):
class MUSAEnv (line 425) | class MUSAEnv(BaseEnv):
method get_info (line 428) | def get_info(self):
method _get_musa_version_info (line 437) | def _get_musa_version_info(self):
method _get_mcc_info (line 451) | def _get_mcc_info(self):
method _get_musa_driver_version (line 472) | def _get_musa_driver_version(self):
method get_topology (line 494) | def get_topology(self):
FILE: python/sglang/cli/generate.py
function generate (line 6) | def generate(args, extra_argv):
FILE: python/sglang/cli/main.py
function version (line 7) | def version(args, extra_argv):
function main (line 12) | def main():
FILE: python/sglang/cli/serve.py
function _extract_model_type_override (line 16) | def _extract_model_type_override(extra_argv):
function serve (line 49) | def serve(args, extra_argv):
FILE: python/sglang/cli/utils.py
function _is_diffusers_model_dir (line 12) | def _is_diffusers_model_dir(model_dir: str) -> bool:
function get_is_diffusion_model (line 24) | def get_is_diffusion_model(model_path: str) -> bool:
function get_model_path (line 65) | def get_model_path(extra_argv):
function get_git_commit_hash (line 94) | def get_git_commit_hash() -> str:
FILE: python/sglang/compile_deep_gemm.py
class CompileArgs (line 40) | class CompileArgs:
method add_cli_args (line 44) | def add_cli_args(parser: argparse.ArgumentParser):
method from_cli_args (line 48) | def from_cli_args(cls, args: argparse.Namespace):
function warm_up_compile (line 57) | async def warm_up_compile(
function launch_server_internal (line 76) | def launch_server_internal(server_args):
function launch_server_process_and_send_one_request (line 85) | def launch_server_process_and_send_one_request(
function refine_server_args (line 145) | def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
function run_compile (line 156) | def run_compile(server_args: ServerArgs, compile_args: CompileArgs):
FILE: python/sglang/eval/llama3_eval.py
function fetch_responses (line 39) | async def fetch_responses(
class CustomAsyncHTTPXClient (line 79) | class CustomAsyncHTTPXClient(httpx.AsyncClient):
method send (line 80) | async def send(self, request: httpx.Request, *args, **kwargs) -> httpx...
function get_client (line 87) | def get_client(provider):
function benchmark (line 103) | async def benchmark(args):
function get_mmlu_answer (line 144) | def get_mmlu_answer(response):
function get_mmlu_cot_answer (line 150) | def get_mmlu_cot_answer(response):
function get_answer_gsm8k (line 172) | def get_answer_gsm8k(response):
function get_dataset_from_task (line 190) | def get_dataset_from_task(task, response_path, model_size):
function analyze (line 221) | def analyze(task, response_path, model_size):
FILE: python/sglang/eval/loogle_eval.py
function get_client (line 15) | def get_client(api_url: str) -> openai.AsyncOpenAI:
function get_dataset (line 21) | def get_dataset():
function fetch_response (line 25) | async def fetch_response(
function benchmark (line 66) | async def benchmark(args):
function analyse (line 98) | def analyse(args):
FILE: python/sglang/global_config.py
class GlobalConfig (line 6) | class GlobalConfig:
method __init__ (line 11) | def __init__(self):
FILE: python/sglang/jit_kernel/__main__.py
function generate_clangd (line 4) | def generate_clangd():
FILE: python/sglang/jit_kernel/add_constant.py
function _jit_add_constant_module (line 14) | def _jit_add_constant_module(constant: int) -> Module:
function add_constant (line 24) | def add_constant(src: torch.Tensor, constant: int) -> torch.Tensor:
FILE: python/sglang/jit_kernel/awq_dequantize.py
function _jit_awq_dequantize_module (line 14) | def _jit_awq_dequantize_module(dtype: torch.dtype) -> Module:
function awq_dequantize (line 24) | def awq_dequantize(
FILE: python/sglang/jit_kernel/awq_marlin_repack.py
function _jit_awq_marlin_repack_module (line 14) | def _jit_awq_marlin_repack_module() -> Module:
function awq_marlin_repack (line 22) | def awq_marlin_repack(
function awq_marlin_moe_repack (line 40) | def awq_marlin_moe_repack(
FILE: python/sglang/jit_kernel/benchmark/bench_awq_dequantize.py
function check_correctness (line 29) | def check_correctness():
function benchmark (line 85) | def benchmark(qweight_row, qweight_col, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_awq_marlin_moe_repack.py
function awq_pack (line 24) | def awq_pack(q_w, num_bits, size_k, size_n):
function make_moe_weights (line 37) | def make_moe_weights(num_experts, size_k, size_n, num_bits, group_size):
function check_correctness (line 54) | def check_correctness():
function benchmark (line 102) | def benchmark(num_experts, size_k, size_n, num_bits, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_awq_marlin_repack.py
function awq_pack (line 25) | def awq_pack(q_w, num_bits, size_k, size_n):
function check_correctness (line 45) | def check_correctness():
function benchmark (line 86) | def benchmark(size_k, size_n, num_bits, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_concat_mla.py
function aot_concat_mla_k (line 27) | def aot_concat_mla_k(k, k_nope, k_rope):
function jit_concat_mla_k (line 31) | def jit_concat_mla_k(k, k_nope, k_rope):
function torch_concat_mla_k (line 35) | def torch_concat_mla_k(k, k_nope, k_rope):
function aot_concat_mla_absorb_q (line 41) | def aot_concat_mla_absorb_q(a, b):
function jit_concat_mla_absorb_q (line 45) | def jit_concat_mla_absorb_q(a, b):
function torch_concat_mla_absorb_q (line 49) | def torch_concat_mla_absorb_q(a, b, out):
function _create_concat_mla_k_data (line 65) | def _create_concat_mla_k_data(num_tokens):
function bench_concat_mla_k (line 102) | def bench_concat_mla_k(num_tokens: int, provider: str):
function bench_concat_mla_absorb_q (line 137) | def bench_concat_mla_absorb_q(dim_0: int, dim_1: int, provider: str):
FILE: python/sglang/jit_kernel/benchmark/bench_fused_add_rmsnorm.py
function sglang_jit_fused_add_rmsnorm (line 14) | def sglang_jit_fused_add_rmsnorm(
function flashinfer_fused_add_rmsnorm (line 20) | def flashinfer_fused_add_rmsnorm(
function benchmark (line 56) | def benchmark(hidden_size: int, batch_size: int, provider: str):
FILE: python/sglang/jit_kernel/benchmark/bench_fused_norm_scale_shift.py
function preprocess_layer (line 37) | def preprocess_layer(layer, affine: bool, D: int, DTYPE: torch.dtype):
function bench_fused_norm_scale_shift (line 65) | def bench_fused_norm_scale_shift(
function bench_fused_scale_residual_norm_scale_shift (line 100) | def bench_fused_scale_residual_norm_scale_shift(
FILE: python/sglang/jit_kernel/benchmark/bench_gptq_marlin.py
function _run_gemm (line 29) | def _run_gemm(fn, a):
function _run_gemm_aot (line 51) | def _run_gemm_aot(a):
function check_correctness (line 73) | def check_correctness():
function benchmark (line 112) | def benchmark(size_m, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_gptq_marlin_repack.py
function _get_inputs (line 24) | def _get_inputs(size_k):
function check_correctness (line 37) | def check_correctness():
function benchmark (line 79) | def benchmark(size_k, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_hadamard.py
function torch_hadamard_transform (line 47) | def torch_hadamard_transform(x, scale, H, dim, dim_padded):
function benchmark (line 85) | def benchmark(batch_size: int, dim: int, provider: str) -> Tuple[float, ...
FILE: python/sglang/jit_kernel/benchmark/bench_hicache.py
class HiCacheCache (line 44) | class HiCacheCache:
method get_slice (line 50) | def get_slice(self, num_layers: int, element_size: int) -> "HiCacheCac...
function gen_indices (line 67) | def gen_indices(
function sglang_aot_transfer_one (line 79) | def sglang_aot_transfer_one(
function sglang_jit_transfer_one (line 100) | def sglang_jit_transfer_one(
function sglang_aot_transfer_all (line 121) | def sglang_aot_transfer_all(
function sglang_jit_transfer_all (line 144) | def sglang_jit_transfer_all(
function pytorch_transfer (line 168) | def pytorch_transfer(
function benchmark_one_layer_h2d (line 218) | def benchmark_one_layer_h2d(
function _create_ptr_tensor (line 299) | def _create_ptr_tensor(tensors, device="cuda"):
function benchmark_all_layer_d2h (line 321) | def benchmark_all_layer_d2h(
FILE: python/sglang/jit_kernel/benchmark/bench_moe_wna16_marlin.py
function stack_and_dev (line 18) | def stack_and_dev(tensors):
function _make_inputs (line 48) | def _make_inputs(size_m):
function _run_jit (line 76) | def _run_jit(
function _run_aot (line 116) | def _run_aot(
function check_correctness (line 156) | def check_correctness():
function benchmark (line 204) | def benchmark(size_m, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_norm.py
function benchmark_rmsnorm (line 48) | def benchmark_rmsnorm(hidden_size: int, batch_size: int, provider: str):
function benchmark_fused_add_rmsnorm (line 72) | def benchmark_fused_add_rmsnorm(hidden_size: int, batch_size: int, provi...
FILE: python/sglang/jit_kernel/benchmark/bench_norm_impls.py
function effective_rows_from_shape (line 158) | def effective_rows_from_shape(input_shape: list[int]) -> int:
function ensure_repo (line 165) | def ensure_repo(repo_name: str, repo_url: str) -> Path:
function ensure_python_dep (line 178) | def ensure_python_dep(module_name: str, package_name: str | None = None)...
function dtype_from_name (line 189) | def dtype_from_name(name: str) -> torch.dtype:
function dtype_name (line 201) | def dtype_name(dtype: torch.dtype) -> str:
function normalize_hidden_sizes (line 210) | def normalize_hidden_sizes(text: str) -> list[int]:
function normalize_dtypes (line 214) | def normalize_dtypes(text: str) -> list[torch.dtype]:
function prewarm (line 218) | def prewarm(fn: Callable[[], object], iters: int = 3) -> None:
function benchmark_provider (line 224) | def benchmark_provider(
function geometric_mean (line 251) | def geometric_mean(values: list[float]) -> float:
function load_flaggems (line 258) | def load_flaggems():
function load_quack (line 272) | def load_quack():
function build_rmsnorm_providers (line 286) | def build_rmsnorm_providers(dtype: torch.dtype, batch_size: int, hidden_...
function build_fused_add_rmsnorm_providers (line 318) | def build_fused_add_rmsnorm_providers(
function build_layernorm_providers (line 370) | def build_layernorm_providers(dtype: torch.dtype, batch_size: int, hidde...
function maybe_benchmark (line 404) | def maybe_benchmark(
function write_csv (line 451) | def write_csv(rows: list[dict[str, object]], output_path: Path) -> None:
function write_markdown (line 478) | def write_markdown(rows: list[dict[str, object]], output_path: Path) -> ...
function run_suite (line 561) | def run_suite(
function run_shape_suite (line 620) | def run_shape_suite(
function main (line 686) | def main() -> None:
FILE: python/sglang/jit_kernel/benchmark/bench_nvfp4_blockwise_moe.py
function _round_up (line 22) | def _round_up(x: int, y: int) -> int:
function _expert_offsets (line 26) | def _expert_offsets(m_per_expert: list[int], device: torch.device) -> to...
function _blockscale_offsets (line 33) | def _blockscale_offsets(m_per_expert: list[int], device: torch.device) -...
function _prepare_case (line 40) | def _prepare_case(
function _torch_ref_group_mm (line 130) | def _torch_ref_group_mm(case: dict[str, Any]) -> torch.Tensor:
function _aot_cutlass_fp4_group_mm (line 143) | def _aot_cutlass_fp4_group_mm(case: dict[str, Any]) -> torch.Tensor:
function _probe_legacy_aot_group_mm (line 171) | def _probe_legacy_aot_group_mm() -> tuple[bool, str]:
function benchmark (line 227) | def benchmark(total_tokens, n, k, num_experts, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_nvfp4_quant.py
function _torch_ref_quant (line 23) | def _torch_ref_quant(input: torch.Tensor, input_global_scale: torch.Tens...
function _aot_scaled_fp4_quant (line 48) | def _aot_scaled_fp4_quant(input: torch.Tensor, input_global_scale: torch...
function _probe_legacy_aot_quant (line 63) | def _probe_legacy_aot_quant() -> tuple[bool, str]:
function _probe_flashinfer_quant (line 92) | def _probe_flashinfer_quant() -> tuple[bool, str]:
function benchmark (line 157) | def benchmark(m, n, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_nvfp4_scaled_mm.py
function _dequantize_to_fp16 (line 37) | def _dequantize_to_fp16(
function _aot_cutlass_scaled_fp4_mm (line 61) | def _aot_cutlass_scaled_fp4_mm(
function _probe_legacy_aot_scaled_mm (line 76) | def _probe_legacy_aot_scaled_mm() -> tuple[bool, str]:
function benchmark (line 143) | def benchmark(m, n, k, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_per_tensor_quant_fp8.py
function vllm_scaled_fp8_quant (line 28) | def vllm_scaled_fp8_quant(
function sglang_scaled_fp8_quant (line 37) | def sglang_scaled_fp8_quant(
function calculate_diff (line 52) | def calculate_diff(batch_size: int, seq_len: int):
function benchmark (line 99) | def benchmark(element_count, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_per_token_group_quant_8bit.py
function _flatten_to_2d (line 159) | def _flatten_to_2d(t: torch.Tensor) -> torch.Tensor:
function _make_sglang_bench_fn (line 166) | def _make_sglang_bench_fn(
function benchmark (line 249) | def benchmark(
FILE: python/sglang/jit_kernel/benchmark/bench_qknorm.py
function sglang_aot_qknorm (line 20) | def sglang_aot_qknorm(
function sglang_jit_qknorm (line 39) | def sglang_jit_qknorm(
function flashinfer_qknorm (line 49) | def flashinfer_qknorm(
function torch_impl_qknorm (line 62) | def torch_impl_qknorm(
function benchmark (line 114) | def benchmark(
FILE: python/sglang/jit_kernel/benchmark/bench_qknorm_across_heads.py
function sglang_jit_qknorm_across_heads (line 18) | def sglang_jit_qknorm_across_heads(
function sglang_aot_qknorm_across_heads (line 28) | def sglang_aot_qknorm_across_heads(
function flashinfer_qknorm_across_heads (line 43) | def flashinfer_qknorm_across_heads(
function torch_impl_qknorm_across_heads (line 56) | def torch_impl_qknorm_across_heads(
function benchmark (line 101) | def benchmark(
FILE: python/sglang/jit_kernel/benchmark/bench_qwen_image_modulation.py
function _make_common_inputs (line 28) | def _make_common_inputs(batch_size: int, seq_len: int, hidden_size: int):
function bench_layernorm_scale_shift_gate_select01 (line 55) | def bench_layernorm_scale_shift_gate_select01(
function bench_residual_layernorm_scale_shift_gate_select01 (line 116) | def bench_residual_layernorm_scale_shift_gate_select01(
FILE: python/sglang/jit_kernel/benchmark/bench_renorm.py
function torch_top_k_renorm_probs (line 11) | def torch_top_k_renorm_probs(probs, top_k):
function torch_top_p_renorm_probs (line 42) | def torch_top_p_renorm_probs(probs, top_p, eps=1e-5):
function torch_top_k_mask_logits (line 81) | def torch_top_k_mask_logits(logits, top_k):
function calculate_diff_top_k_renorm (line 106) | def calculate_diff_top_k_renorm(batch_size, vocab_size, k):
function calculate_diff_top_p_renorm (line 122) | def calculate_diff_top_p_renorm(batch_size, vocab_size, p):
function calculate_diff_top_k_mask (line 138) | def calculate_diff_top_k_mask(batch_size, vocab_size, k):
function benchmark_top_k_renorm (line 181) | def benchmark_top_k_renorm(batch_size, vocab_size, k, provider):
function benchmark_top_p_renorm (line 214) | def benchmark_top_p_renorm(batch_size, vocab_size, p, provider):
function benchmark_top_k_mask (line 243) | def benchmark_top_k_mask(batch_size, vocab_size, k, provider):
FILE: python/sglang/jit_kernel/benchmark/bench_rmsnorm.py
function sglang_aot_rmsnorm (line 18) | def sglang_aot_rmsnorm(
function sglang_jit_rmsnorm (line 25) | def sglang_jit_rmsnorm(
function flashinfer_rmsnorm (line 32) | def flashinfer_rmsnorm(
function torch_impl_rmsnorm (line 40) | def torch_impl_rmsnorm(
function benchmark (line 79) | def benchmark(hidden_size: int, batch_size: int, provider: str):
FILE: python/sglang/jit_kernel/benchmark/bench_rope.py
function create_cos_sin_cache (line 20) | def create_cos_sin_cache(
function flashinfer_rope (line 48) | def flashinfer_rope(
function sglang_pos_enc_rope (line 67) | def sglang_pos_enc_rope(
function sglang_fused_rope (line 86) | def sglang_fused_rope(
function jit_rope_then_store (line 102) | def jit_rope_then_store(
function jit_fused_rope_store (line 134) | def jit_fused_rope_store(
function benchmark (line 198) | def benchmark(batch_size: int, num_q_k_heads: str, is_neox: bool, provid...
function benchmark_store (line 255) | def benchmark_store(batch_size: int, num_q_k_heads: str, is_neox: bool, ...
FILE: python/sglang/jit_kernel/benchmark/bench_store_cache.py
function sglang_jit_store_cache (line 17) | def sglang_jit_store_cache(
function torch_compile_store_cache (line 28) | def torch_compile_store_cache(
function torch_streams_store_cache (line 42) | def torch_streams_store_cache(
function benchmark (line 89) | def benchmark(
FILE: python/sglang/jit_kernel/benchmark/utils.py
function is_in_ci (line 16) | def is_in_ci() -> bool:
function get_benchmark_range (line 21) | def get_benchmark_range(full_range: List, ci_range: List) -> List:
function run_benchmark (line 26) | def run_benchmark(
function run_benchmark_no_cudagraph (line 43) | def run_benchmark_no_cudagraph(
FILE: python/sglang/jit_kernel/concat_mla.py
function _jit_concat_mla_k_module (line 14) | def _jit_concat_mla_k_module() -> Module:
function _jit_concat_mla_absorb_q_module (line 23) | def _jit_concat_mla_absorb_q_module() -> Module:
function concat_mla_k (line 31) | def concat_mla_k(k: torch.Tensor, k_nope: torch.Tensor, k_rope: torch.Te...
function concat_mla_absorb_q (line 47) | def concat_mla_absorb_q(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
FILE: python/sglang/jit_kernel/csrc/fast-hadamard-transform/code_gen.py
function string_to_array (line 161) | def string_to_array(string):
function array_code_gen (line 172) | def array_code_gen(arr):
function main (line 185) | def main():
FILE: python/sglang/jit_kernel/csrc/fast-hadamard-transform/fast_hadamard_transform.h
type HadamardParamsBase (line 11) | struct HadamardParamsBase {
FILE: python/sglang/jit_kernel/csrc/fast-hadamard-transform/fast_hadamard_transform_common.h
type uint8 (line 16) | struct uint8 {
type BytesToType (line 25) | struct BytesToType
type BytesToType (line 31) | struct BytesToType
type BytesToType (line 37) | struct BytesToType
type BytesToType (line 43) | struct BytesToType
type BytesToType (line 49) | struct BytesToType
type BytesToType (line 55) | struct BytesToType
function __device__ (line 64) | __device__ inline T operator()(T const& x, T const& y) {
function T (line 73) | inline T run(T x, Operator& op) {
type Allreduce (line 81) | struct Allreduce
function T (line 83) | inline T run(T x, Operator& op) {
function cilog2 (line 92) | constexpr int cilog2(int val) {
function hadamard_mult_thread (line 99) | void hadamard_mult_thread(float x[kNChunks][1 << kLogN]) {
function __device__ (line 141) | inline __device__ void load_input(input_t* x, float x_vals[kNChunks][kNE...
FILE: python/sglang/jit_kernel/csrc/fast-hadamard-transform/fast_hadamard_transform_special.h
function hadamard_mult_thread_12 (line 12) | void hadamard_mult_thread_12(float x[12]) {
function hadamard_mult_thread_20 (line 32) | void hadamard_mult_thread_20(float x[20]) {
function hadamard_mult_thread_28 (line 80) | void hadamard_mult_thread_28(float x[28]) {
function hadamard_mult_thread_40 (line 172) | void hadamard_mult_thread_40(float x[40]) {
FILE: python/sglang/jit_kernel/csrc/gemm/marlin/dequant.h
function namespace (line 68) | namespace device::marlin {
FILE: python/sglang/jit_kernel/csrc/gemm/marlin/kernel.h
function namespace (line 13) | namespace device::marlin {
FILE: python/sglang/jit_kernel/csrc/gemm/marlin/marlin_template.h
function namespace (line 32) | namespace device::marlin {
function __device__ (line 254) | __device__ inline void wait_negative_and_add(int* lock) {
function transform_a (line 612) | auto transform_a = [&](int i) {
function init_same_group (line 834) | auto init_same_group = [&](int pipe) {
function matmul (line 1053) | auto matmul = [&](int k) {
FILE: python/sglang/jit_kernel/csrc/gemm/marlin_moe/kernel.h
function namespace (line 16) | namespace device::marlin_moe {
FILE: python/sglang/jit_kernel/csrc/gemm/marlin_moe/marlin_template.h
function namespace (line 33) | namespace device::marlin_moe {
function __device__ (line 265) | __device__ inline void wait_negative_and_add(int* lock) {
function else (line 351) | else if constexpr (std::is_same<scalar_t, half>::value) {
function read_moe_block_data (line 444) | auto read_moe_block_data = [&](int block_id) {
function transform_a (line 756) | auto transform_a = [&](int i) {
function init_same_group (line 996) | auto init_same_group = [&](int pipe) {
function matmul (line 1218) | auto matmul = [&](int k) {
function write_result (line 1516) | auto write_result = [&](bool last) {
FILE: python/sglang/jit_kernel/cutedsl_gdn.py
function _define_kernels (line 33) | def _define_kernels():
function _create_jit_functions (line 951) | def _create_jit_functions():
function _get_jit_functions (line 1273) | def _get_jit_functions():
function _get_compiled_kernel (line 1280) | def _get_compiled_kernel(N, H, HV, K, V, pool_size, use_small_batch, is_...
function cutedsl_fused_sigmoid_gating_delta_rule_update (line 1373) | def cutedsl_fused_sigmoid_gating_delta_rule_update(
FILE: python/sglang/jit_kernel/diffusion/cutedsl/common/norm_fusion.py
function apply_norm_cta (line 15) | def apply_norm_cta(
function apply_rmsnorm_cta (line 32) | def apply_rmsnorm_cta(
function apply_layernorm_cta (line 61) | def apply_layernorm_cta(
function broadcast_tensor_for_bsfd (line 130) | def broadcast_tensor_for_bsfd(
function tensor_slice_for_bsfd (line 160) | def tensor_slice_for_bsfd(
FILE: python/sglang/jit_kernel/diffusion/cutedsl/common/reduce.py
function warp_reduce_sum (line 8) | def warp_reduce_sum(val: cute.Numeric, reduce_size: int = 32) -> cute.Nu...
function cta_reduce_sum (line 16) | def cta_reduce_sum(
FILE: python/sglang/jit_kernel/diffusion/cutedsl/scale_residual_norm_scale_shift.py
function to_cute_arg (line 18) | def to_cute_arg(
function to_fake_cute_args (line 42) | def to_fake_cute_args(t: torch.Tensor):
class ScaleResidualNormScaleShift (line 57) | class ScaleResidualNormScaleShift:
method make_hash_key (line 59) | def make_hash_key(cls, *inputs):
method __init__ (line 81) | def __init__(self, D: int, norm_type: str):
method __call__ (line 88) | def __call__(
method kernel (line 135) | def kernel(
function validate_x (line 228) | def validate_x(t: torch.Tensor, B: int, S: int, D: int):
function validate_weight_bias (line 237) | def validate_weight_bias(t: Optional[torch.Tensor], B: int, S: int, D: i...
function validate_scale_shift (line 248) | def validate_scale_shift(t: torch.Tensor, B: int, S: int, D: int):
function validate_gate (line 271) | def validate_gate(t: Union[torch.Tensor, int], B: int, S: int, D: int):
function fused_norm_scale_shift (line 278) | def fused_norm_scale_shift(
function _fused_norm_scale_shift_fake (line 344) | def _fused_norm_scale_shift_fake(x, weight, bias, scale, shift, norm_typ...
function fused_scale_residual_norm_scale_shift (line 352) | def fused_scale_residual_norm_scale_shift(
function _fused_scale_residual_norm_scale_shift_fake (line 426) | def _fused_scale_residual_norm_scale_shift_fake(
FILE: python/sglang/jit_kernel/diffusion/triton/mps_fallback.py
function _torch_to_mlx (line 45) | def _torch_to_mlx(tensor: torch.Tensor) -> "mx.array":
function _mlx_to_torch (line 53) | def _mlx_to_torch(array: "mx.array", device: torch.device) -> torch.Tensor:
function fuse_scale_shift_kernel_native (line 64) | def fuse_scale_shift_kernel_native(
function fuse_scale_shift_gate_select01_kernel_native (line 97) | def fuse_scale_shift_gate_select01_kernel_native(
function apply_rotary_embedding_native (line 118) | def apply_rotary_embedding_native(
function norm_infer_native (line 131) | def norm_infer_native(
function triton_one_pass_rms_norm_native (line 160) | def triton_one_pass_rms_norm_native(
function rms_norm_fn_native (line 172) | def rms_norm_fn_native(
function norm_infer_native (line 223) | def norm_infer_native( # noqa: F811
function triton_one_pass_rms_norm_native (line 250) | def triton_one_pass_rms_norm_native( # noqa: F811
function rms_norm_fn_native (line 262) | def rms_norm_fn_native( # noqa: F811
FILE: python/sglang/jit_kernel/diffusion/triton/norm.py
function maybe_contiguous_lastdim (line 10) | def maybe_contiguous_lastdim(x):
function maybe_contiguous (line 14) | def maybe_contiguous(x):
function triton_autotune_configs (line 18) | def triton_autotune_configs():
function _layer_norm_fwd_1pass_kernel (line 64) | def _layer_norm_fwd_1pass_kernel(
function _layer_norm_fwd (line 188) | def _layer_norm_fwd(
function _layer_norm_fwd_impl (line 251) | def _layer_norm_fwd_impl(
class LayerNormFn (line 373) | class LayerNormFn:
method forward (line 376) | def forward(
function layer_norm_fn (line 453) | def layer_norm_fn(
function _norm_infer_kernel (line 496) | def _norm_infer_kernel(
function norm_infer (line 540) | def norm_infer(
function rms_norm_fn (line 582) | def rms_norm_fn(
FILE: python/sglang/jit_kernel/diffusion/triton/npu_fallback.py
function fuse_scale_shift_native (line 5) | def fuse_scale_shift_native(
function apply_rotary_embedding_native (line 16) | def apply_rotary_embedding_native(
FILE: python/sglang/jit_kernel/diffusion/triton/rmsnorm_onepass.py
function _rms_norm_tiled_onepass (line 8) | def _rms_norm_tiled_onepass(
function triton_one_pass_rms_norm (line 36) | def triton_one_pass_rms_norm(x: torch.Tensor, w: torch.Tensor, eps: floa...
FILE: python/sglang/jit_kernel/diffusion/triton/rotary.py
function _rotary_embedding_kernel (line 18) | def _rotary_embedding_kernel(
function apply_rotary_embedding (line 67) | def apply_rotary_embedding(
FILE: python/sglang/jit_kernel/diffusion/triton/scale_shift.py
function _fused_layernorm_scale_shift_gate_select01_kernel (line 9) | def _fused_layernorm_scale_shift_gate_select01_kernel(
function _fused_residual_layernorm_scale_shift_gate_select01_kernel (line 116) | def _fused_residual_layernorm_scale_shift_gate_select01_kernel(
function _fused_scale_shift_4d_kernel (line 247) | def _fused_scale_shift_4d_kernel(
function fuse_scale_shift_kernel_blc_opt (line 292) | def fuse_scale_shift_kernel_blc_opt(
function fuse_scale_shift_gate_select01_kernel_blc_opt (line 360) | def fuse_scale_shift_gate_select01_kernel_blc_opt(
function fuse_scale_shift_kernel (line 447) | def fuse_scale_shift_kernel(
function fuse_scale_shift_gate_select01_kernel (line 566) | def fuse_scale_shift_gate_select01_kernel(
function fuse_layernorm_scale_shift_gate_select01_kernel (line 638) | def fuse_layernorm_scale_shift_gate_select01_kernel(
function fuse_residual_layernorm_scale_shift_gate_select01_kernel (line 727) | def fuse_residual_layernorm_scale_shift_gate_select01_kernel(
FILE: python/sglang/jit_kernel/flash_attention_v4.py
function _maybe_contiguous (line 16) | def _maybe_contiguous(x: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
function flash_attn_varlen_func (line 20) | def flash_attn_varlen_func(
function flash_attn_with_kvcache (line 92) | def flash_attn_with_kvcache(
FILE: python/sglang/jit_kernel/fused_metadata_copy.py
function _jit_fused_metadata_copy_module (line 29) | def _jit_fused_metadata_copy_module(
function _jit_fused_metadata_copy_multi_module (line 62) | def _jit_fused_metadata_copy_multi_module(
function fused_metadata_copy_cuda (line 97) | def fused_metadata_copy_cuda(
function fused_metadata_copy_multi_cuda (line 199) | def fused_metadata_copy_multi_cuda(
FILE: python/sglang/jit_kernel/fused_store_index_cache.py
function _jit_nsa_fused_store_module (line 30) | def _jit_nsa_fused_store_module(
function can_use_nsa_fused_store (line 55) | def can_use_nsa_fused_store(
function fused_store_index_k_cache (line 67) | def fused_store_index_k_cache(
FILE: python/sglang/jit_kernel/gptq_marlin.py
function _jit_gptq_marlin_module (line 18) | def _jit_gptq_marlin_module(dtype: torch.dtype) -> Module:
function _or_empty (line 28) | def _or_empty(
function gptq_marlin_gemm (line 34) | def gptq_marlin_gemm(
FILE: python/sglang/jit_kernel/gptq_marlin_repack.py
function _jit_gptq_marlin_repack_module (line 17) | def _jit_gptq_marlin_repack_module() -> Module:
function gptq_marlin_repack (line 25) | def gptq_marlin_repack(
FILE: python/sglang/jit_kernel/hadamard.py
function _jit_hadamard_module (line 14) | def _jit_hadamard_module(dtype: torch.dtype) -> Module:
function _hadamard_transform_impl (line 32) | def _hadamard_transform_impl(
function hadamard_transform (line 59) | def hadamard_transform(x: torch.Tensor, scale: float = 1.0) -> torch.Ten...
function hadamard_transform_12n (line 64) | def hadamard_transform_12n(x: torch.Tensor, scale: float = 1.0) -> torch...
function hadamard_transform_20n (line 69) | def hadamard_transform_20n(x: torch.Tensor, scale: float = 1.0) -> torch...
function hadamard_transform_28n (line 74) | def hadamard_transform_28n(x: torch.Tensor, scale: float = 1.0) -> torch...
function hadamard_transform_40n (line 79) | def hadamard_transform_40n(x: torch.Tensor, scale: float = 1.0) -> torch...
FILE: python/sglang/jit_kernel/hicache.py
function _jit_hicache_module (line 16) | def _jit_hicache_module(*, element_size: int, unroll: int, block_quota: ...
function can_use_hicache_jit_kernel (line 34) | def can_use_hicache_jit_kernel(
function _default_unroll (line 58) | def _default_unroll(element_size: int) -> int:
function transfer_hicache_one_layer (line 69) | def transfer_hicache_one_layer(
function transfer_hicache_all_layer (line 104) | def transfer_hicache_all_layer(
FILE: python/sglang/jit_kernel/include/sgl_kernel/scalar_type.hpp
type host (line 9) | namespace host {
class ScalarType (line 20) | class ScalarType {
type NanRepr (line 22) | enum NanRepr : uint8_t {
method ScalarType (line 30) | constexpr ScalarType(
method ScalarType (line 44) | static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) {
method ScalarType (line 48) | static constexpr ScalarType uint(uint8_t size_bits, int32_t bias = 0) {
method ScalarType (line 53) | static constexpr ScalarType float_IEEE754(uint8_t exponent, uint8_t ...
method ScalarType (line 59) | static constexpr ScalarType float_(uint8_t exponent, uint8_t mantiss...
method member_id_field_width (line 84) | static constexpr size_t member_id_field_width() {
method reduce_members_helper (line 90) | static constexpr auto reduce_members_helper(Fn f, Init val, Member m...
method reduce_members (line 100) | constexpr auto reduce_members(Fn f, Init init) const {
method reduce_member_types (line 106) | static constexpr auto reduce_member_types(Fn f, Init init) {
method id_size_bits (line 111) | static constexpr auto id_size_bits() {
method Id (line 120) | constexpr Id id() const {
method ScalarType (line 134) | static constexpr ScalarType from_id(Id id) {
method size_bits (line 148) | constexpr int64_t size_bits() const {
method is_signed (line 151) | constexpr bool is_signed() const {
method is_integer (line 154) | constexpr bool is_integer() const {
method is_floating_point (line 157) | constexpr bool is_floating_point() const {
method is_ieee_754 (line 160) | constexpr bool is_ieee_754() const {
method has_nans (line 163) | constexpr bool has_nans() const {
method has_infs (line 166) | constexpr bool has_infs() const {
method has_bias (line 169) | constexpr bool has_bias() const {
method _floating_point_max (line 175) | double _floating_point_max() const {
method _raw_max (line 208) | constexpr std::variant<int64_t, double> _raw_max() const {
method _raw_min (line 217) | constexpr std::variant<int64_t, double> _raw_min() const {
method max (line 242) | constexpr std::variant<int64_t, double> max() const {
method min (line 248) | constexpr std::variant<int64_t, double> min() const {
method str (line 254) | std::string str() const {
FILE: python/sglang/jit_kernel/include/sgl_kernel/source_location.h
type source_location_fallback (line 18) | struct source_location_fallback {
FILE: python/sglang/jit_kernel/include/sgl_kernel/tensor.h
function namespace (line 38) | namespace host {
type PrintableDevice (line 129) | struct PrintableDevice {
type SymbolicDType (line 253) | struct SymbolicDType {
function DLDataType (line 274) | auto unwrap(DebugInfo info = {}) const -> DLDataType {
type SymbolicDevice (line 316) | struct SymbolicDevice {
function DLDevice (line 341) | auto unwrap(DebugInfo info = {}) const -> DLDevice {
function namespace (line 382) | namespace details {
function SymbolicDevice (line 432) | struct DeviceRef : BaseRef<SymbolicDevice> {
FILE: python/sglang/jit_kernel/include/sgl_kernel/utils.h
function namespace (line 57) | namespace host {
FILE: python/sglang/jit_kernel/kvcache.py
function _jit_kvcache_module (line 20) | def _jit_kvcache_module(row_bytes: int) -> Module:
function can_use_store_cache (line 31) | def can_use_store_cache(size: int) -> bool:
function store_cache (line 49) | def store_cache(
FILE: python/sglang/jit_kernel/moe_lora_align.py
function _jit_moe_align_module (line 14) | def _jit_moe_align_module(dtype: torch.dtype) -> Module:
function moe_lora_align_block_size (line 26) | def moe_lora_align_block_size(
FILE: python/sglang/jit_kernel/moe_wna16_marlin.py
function _jit_moe_wna16_marlin_module (line 18) | def _jit_moe_wna16_marlin_module(dtype: torch.dtype) -> Module:
function _or_empty (line 33) | def _or_empty(
function moe_wna16_marlin_gemm (line 39) | def moe_wna16_marlin_gemm(
FILE: python/sglang/jit_kernel/ngram_embedding.py
function _jit_ngram_embedding_module (line 13) | def _jit_ngram_embedding_module() -> Module:
function compute_n_gram_ids (line 24) | def compute_n_gram_ids(
function update_token_table (line 69) | def update_token_table(
FILE: python/sglang/jit_kernel/norm.py
function _jit_qknorm_module (line 20) | def _jit_qknorm_module(head_dim: int, dtype: torch.dtype) -> Module:
function _jit_rmsnorm_module (line 31) | def _jit_rmsnorm_module(hidden_size: int, dtype: torch.dtype) -> Module:
function _jit_fused_add_rmsnorm_module (line 42) | def _jit_fused_add_rmsnorm_module(dtype: torch.dtype) -> Module:
function _jit_qknorm_across_heads_module (line 53) | def _jit_qknorm_across_heads_module(dtype: torch.dtype) -> Module:
function can_use_fused_inplace_qknorm (line 66) | def can_use_fused_inplace_qknorm(head_dim: int, dtype: torch.dtype) -> b...
function fused_inplace_qknorm (line 79) | def fused_inplace_qknorm(
function rmsnorm (line 93) | def rmsnorm(
function fused_add_rmsnorm (line 105) | def fused_add_rmsnorm(
function fused_inplace_qknorm_across_heads (line 115) | def fused_inplace_qknorm_across_heads(
FILE: python/sglang/jit_kernel/nvfp4.py
function _find_package_root (line 22) | def _find_package_root(package: str) -> Optional[pathlib.Path]:
function _resolve_cutlass_include_paths (line 29) | def _resolve_cutlass_include_paths() -> list[str]:
function _nvfp4_cuda_flags (line 59) | def _nvfp4_cuda_flags() -> list[str]:
function _get_nvfp4_cuda_arch_list (line 73) | def _get_nvfp4_cuda_arch_list() -> str:
function _nvfp4_arch_env (line 90) | def _nvfp4_arch_env():
function _jit_nvfp4_quant_module (line 104) | def _jit_nvfp4_quant_module() -> Module:
function _jit_nvfp4_expert_quant_module (line 127) | def _jit_nvfp4_expert_quant_module() -> Module:
function _jit_nvfp4_scaled_mm_module (line 154) | def _jit_nvfp4_scaled_mm_module() -> Module:
function _jit_nvfp4_blockwise_moe_module (line 176) | def _jit_nvfp4_blockwise_moe_module() -> Module:
function cutlass_scaled_fp4_mm (line 198) | def cutlass_scaled_fp4_mm(
function cutlass_fp4_group_mm (line 214) | def cutlass_fp4_group_mm(
function _scaled_fp4_quant_custom_op (line 283) | def _scaled_fp4_quant_custom_op(
function scaled_fp4_quant (line 293) | def scaled_fp4_quant(
function _shuffle_rows_torch (line 329) | def _shuffle_rows_torch(
function _scaled_fp4_experts_quant_custom_op (line 343) | def _scaled_fp4_experts_quant_custom_op(
function scaled_fp4_experts_quant (line 362) | def scaled_fp4_experts_quant(
function _scaled_fp4_grouped_quant_custom_op (line 426) | def _scaled_fp4_grouped_quant_custom_op(
function scaled_fp4_grouped_quant (line 446) | def scaled_fp4_grouped_quant(
function _silu_and_mul_scaled_fp4_grouped_quant_custom_op (line 486) | def _silu_and_mul_scaled_fp4_grouped_quant_custom_op(
function silu_and_mul_scaled_fp4_grouped_quant (line 506) | def silu_and_mul_scaled_fp4_grouped_quant(
function _cutlass_fp4_group_mm_custom_op (line 557) | def _cutlass_fp4_group_mm_custom_op(
function suggest_nvfp4_global_scale (line 602) | def suggest_nvfp4_global_scale(x: torch.Tensor) -> torch.Tensor:
FILE: python/sglang/jit_kernel/per_tensor_quant_fp8.py
function _jit_per_tensor_quant_fp8_module (line 15) | def _jit_per_tensor_quant_fp8_module(is_static: bool, dtype: torch.dtype...
function per_tensor_quant_fp8 (line 29) | def per_tensor_quant_fp8(
FILE: python/sglang/jit_kernel/per_token_group_quant_8bit.py
function _jit_per_token_group_quant_8bit_module (line 17) | def _jit_per_token_group_quant_8bit_module(
function _per_token_group_quant_8bit_custom_op (line 38) | def _per_token_group_quant_8bit_custom_op(
function per_token_group_quant_8bit (line 75) | def per_token_group_quant_8bit(
FILE: python/sglang/jit_kernel/rope.py
function _jit_rotary_embedding_module (line 21) | def _jit_rotary_embedding_module() -> Module:
function _jit_fused_rope_module (line 30) | def _jit_fused_rope_module(is_neox: bool, rope_dim: int, dtype: torch.dt...
function rotary_embedding_with_key (line 47) | def rotary_embedding_with_key(
function rotary_embedding_without_key (line 63) | def rotary_embedding_without_key(
function rotary_embedding (line 74) | def rotary_embedding(
class FusedSetKVBufferArg (line 94) | class FusedSetKVBufferArg:
function apply_rope_inplace (line 113) | def apply_rope_inplace(
function apply_rope_inplace_with_kvcache (line 141) | def apply_rope_inplace_with_kvcache(
function apply_rope_with_cos_sin_cache_inplace (line 179) | def apply_rope_with_cos_sin_cache_inplace(
FILE: python/sglang/jit_kernel/tests/test_add_constant.py
function test_add_constant (line 9) | def test_add_constant(size: int, constant: int) -> None:
FILE: python/sglang/jit_kernel/tests/test_awq_dequantize.py
function reverse_awq_order (line 16) | def reverse_awq_order(t: torch.Tensor):
function awq_dequantize_torch (line 35) | def awq_dequantize_torch(
function test_awq_dequantize_jit_vs_torch (line 76) | def test_awq_dequantize_jit_vs_torch(
function test_awq_dequantize_jit_vs_aot (line 124) | def test_awq_dequantize_jit_vs_aot(
FILE: python/sglang/jit_kernel/tests/test_awq_marlin_moe_repack.py
function _has_aot_awq_marlin_moe_repack (line 12) | def _has_aot_awq_marlin_moe_repack() -> bool:
function awq_pack (line 21) | def awq_pack(
function test_awq_marlin_moe_repack_jit_vs_aot (line 46) | def test_awq_marlin_moe_repack_jit_vs_aot(
function test_awq_marlin_moe_repack_shape (line 87) | def test_awq_marlin_moe_repack_shape(
FILE: python/sglang/jit_kernel/tests/test_awq_marlin_repack.py
function _has_aot_awq_marlin_repack (line 13) | def _has_aot_awq_marlin_repack() -> bool:
function awq_pack (line 22) | def awq_pack(
function test_awq_marlin_repack_jit_vs_aot (line 46) | def test_awq_marlin_repack_jit_vs_aot(num_bits, k_tiles, n_tiles, group_...
function test_awq_marlin_repack_correct (line 76) | def test_awq_marlin_repack_correct(num_bits, k_tiles, n_tiles, group_size):
FILE: python/sglang/jit_kernel/tests/test_concat_mla.py
function torch_concat_mla_k (line 8) | def torch_concat_mla_k(
function torch_concat_mla_absorb_q (line 21) | def torch_concat_mla_absorb_q(
function sgl_kernel_concat_mla_k (line 33) | def sgl_kernel_concat_mla_k(
function sgl_kernel_concat_mla_absorb_q (line 42) | def sgl_kernel_concat_mla_absorb_q(
function jit_concat_mla_k (line 52) | def jit_concat_mla_k(
function jit_concat_mla_absorb_q (line 61) | def jit_concat_mla_absorb_q(
function test_concat_mla_k_jit_vs_torch (line 89) | def test_concat_mla_k_jit_vs_torch(num_tokens: int) -> None:
function test_concat_mla_k_jit_vs_aot (line 110) | def test_concat_mla_k_jit_vs_aot(num_tokens: int) -> None:
function test_concat_mla_absorb_q_jit_vs_torch (line 138) | def test_concat_mla_absorb_q_jit_vs_torch(dim_0: int, dim_1: int) -> None:
function test_concat_mla_absorb_q_jit_vs_aot (line 155) | def test_concat_mla_absorb_q_jit_vs_aot(dim_0: int, dim_1: int) -> None:
FILE: python/sglang/jit_kernel/tests/test_cutedsl_gdn.py
function run_triton_kernel (line 29) | def run_triton_kernel(A_log, dt_bias, q, k, v, a, b, initial_state, indi...
function test_cutedsl_gdn_precision (line 57) | def test_cutedsl_gdn_precision(B: int):
function test_cutedsl_gdn_performance (line 114) | def test_cutedsl_gdn_performance(B: int):
FILE: python/sglang/jit_kernel/tests/test_flash_attention_4.py
function apply_rotary_emb (line 19) | def apply_rotary_emb(
function unpad_input (line 81) | def unpad_input(hidden_states, attention_mask, unused_mask=None):
function pad_input (line 115) | def pad_input(hidden_states, indices, batch, seqlen):
function generate_random_padding_mask (line 133) | def generate_random_padding_mask(
function generate_qkv (line 171) | def generate_qkv(
function construct_local_mask (line 322) | def construct_local_mask(
function construct_chunk_mask (line 363) | def construct_chunk_mask(
function attention_ref (line 399) | def attention_ref(
function test_flash_attn_varlen_output (line 604) | def test_flash_attn_varlen_output(
function test_flash_attn_kvcache (line 997) | def test_flash_attn_kvcache(
function _generate_block_kvcache (line 1471) | def _generate_block_kvcache(
FILE: python/sglang/jit_kernel/tests/test_fused_add_rmsnorm.py
function sglang_jit_fused_add_rmsnorm (line 9) | def sglang_jit_fused_add_rmsnorm(
function flashinfer_fused_add_rmsnorm (line 17) | def flashinfer_fused_add_rmsnorm(
function test_fused_add_rmsnorm (line 39) | def test_fused_add_rmsnorm(batch_size: int, hidden_size: int) -> None:
FILE: python/sglang/jit_kernel/tests/test_fused_metadata_copy.py
function create_test_metadata (line 21) | def create_test_metadata(
function reference_copy_decode (line 125) | def reference_copy_decode(src, dst, max_len):
function reference_copy_target_verify (line 148) | def reference_copy_target_verify(src, dst, max_seqlen_k, seqlens_expande...
function reference_copy_draft_extend (line 176) | def reference_copy_draft_extend(src, dst, max_seqlen_k, seqlens_expanded...
function test_fused_metadata_copy_dtype_validation (line 209) | def test_fused_metadata_copy_dtype_validation():
function test_fused_metadata_copy (line 322) | def test_fused_metadata_copy(bs, forward_mode, has_real_page_table, has_...
function test_fused_metadata_copy_large_batch (line 423) | def test_fused_metadata_copy_large_batch(bs):
function create_test_metadata_multi (line 488) | def create_test_metadata_multi(
function reference_copy_for_loop (line 594) | def reference_copy_for_loop(src, dst_list, bs, max_len):
function test_fused_metadata_copy_multi_dtype_validation (line 618) | def test_fused_metadata_copy_multi_dtype_validation():
function test_fused_metadata_copy_multi (line 713) | def test_fused_metadata_copy_multi(bs, has_real_page_table, has_flashmla):
function test_fused_metadata_copy_multi_large_batch (line 915) | def test_fused_metadata_copy_multi_large_batch(bs):
FILE: python/sglang/jit_kernel/tests/test_fused_norm_scale_shift.py
function _tol (line 39) | def _tol(dtype: torch.dtype):
function cuda_setup (line 44) | def cuda_setup():
function _apply_scale_shift (line 50) | def _apply_scale_shift(y: Tensor, scale: Tensor, shift: Tensor) -> Tensor:
function fused_norm_scale_shift_ref (line 63) | def fused_norm_scale_shift_ref(
function fused_scale_residual_norm_scale_shift_ref (line 83) | def fused_scale_residual_norm_scale_shift_ref(
function _make_tensor (line 117) | def _make_tensor(index_mode: str, shape: Tuple, dtype: torch.dtype):
function run_norm_scale_shift (line 124) | def run_norm_scale_shift(
function run_scale_resi_norm_scale_shift (line 147) | def run_scale_resi_norm_scale_shift(
class TestFusedNormScaleShift (line 178) | class TestFusedNormScaleShift:
method test_shape_dtype (line 181) | def test_shape_dtype(self, shape, dtype, norm_type):
method test_dtype_0 (line 185) | def test_dtype_0(self, dtype, norm_type):
method test_dtype_1 (line 189) | def test_dtype_1(self, dtype, norm_type):
method test_normtype_affine (line 193) | def test_normtype_affine(self, affine_mode, norm_type):
method test_index_mode (line 197) | def test_index_mode(self, index_mode, norm_type):
class TestFusedScaleResidualNormScaleShift (line 204) | class TestFusedScaleResidualNormScaleShift:
method test_shape_dtype (line 207) | def test_shape_dtype(self, shape, dtype, norm_type):
method test_dtype_0 (line 211) | def test_dtype_0(self, dtype, norm_type):
method test_dtype_1 (line 215) | def test_dtype_1(self, dtype, norm_type):
method test_normtype_affine (line 221) | def test_normtype_affine(self, affine_mode, norm_type):
method test_scale_shift_index_mode (line 225) | def test_scale_shift_index_mode(self, index_mode, norm_type):
method test_gate_index_mode (line 231) | def test_gate_index_mode(self, index_mode, norm_type):
FILE: python/sglang/jit_kernel/tests/test_fused_store_index_cache.py
function _skip_if_unavailable (line 56) | def _skip_if_unavailable(page_size: int = PAGE_SIZE):
function _num_pages (line 71) | def _num_pages(loc: torch.Tensor, page_size: int, extra: int = 1) -> int:
function _make_buffer (line 75) | def _make_buffer(num_pages: int, page_size: int = PAGE_SIZE) -> torch.Te...
function _read_token_from_buffer (line 83) | def _read_token_from_buffer(
function _write_token_to_buffer (line 109) | def _write_token_to_buffer(
function _gather_tokens (line 135) | def _gather_tokens(
function _reference_quantize_and_store (line 152) | def _reference_quantize_and_store(
function _import_act_quant (line 182) | def _import_act_quant():
function _ref_store_via_act_quant (line 191) | def _ref_store_via_act_quant(
function test_fused_kernel_matches_own_algorithm (line 241) | def test_fused_kernel_matches_own_algorithm(num_tokens: int, base_index:...
function test_fused_kernel_vs_act_quant_semantic (line 298) | def test_fused_kernel_vs_act_quant_semantic(scale_fmt: Optional[str]):
function test_roundtrip_reconstruction (line 368) | def test_roundtrip_reconstruction(num_tokens: int):
function test_single_token (line 395) | def test_single_token():
function test_zero_input (line 412) | def test_zero_input():
function test_reference_writes_nonzero (line 436) | def test_reference_writes_nonzero():
FILE: python/sglang/jit_kernel/tests/test_fused_verify_triton_gdn.py
function _make_tensors (line 26) | def _make_tensors(N, T, H, HV, K, V, device="cuda", seed=2025):
function run_reference (line 42) | def run_reference(
function run_fused_mtp (line 91) | def run_fused_mtp(
function test_fused_gdn_mtp_precision (line 135) | def test_fused_gdn_mtp_precision(N: int, T: int):
function test_mtp_single_step_decode (line 178) | def test_mtp_single_step_decode(N: int):
FILE: python/sglang/jit_kernel/tests/test_gptq_marlin.py
function test_gptq_marlin_gemm (line 23) | def test_gptq_marlin_gemm(
FILE: python/sglang/jit_kernel/tests/test_gptq_marlin_repack.py
function test_gptq_marlin_repack (line 34) | def test_gptq_marlin_repack(
FILE: python/sglang/jit_kernel/tests/test_hadamard_jit.py
function _parse_hadamard_str (line 135) | def _parse_hadamard_str(s):
function hadamard_transform_ref (line 154) | def hadamard_transform_ref(x, scale=1.0):
function hadamard_transform_mn_ref (line 173) | def hadamard_transform_mn_ref(x, multiple, scale=1.0):
function test_hadamard_transform (line 223) | def test_hadamard_transform(dim, dtype):
function test_hadamard_transform_non_power_of_two (line 254) | def test_hadamard_transform_non_power_of_two(dim, dtype):
function test_hadamard_transform_3d_input (line 277) | def test_hadamard_transform_3d_input(dtype):
function test_hadamard_transform_scale_one (line 298) | def test_hadamard_transform_scale_one(dtype):
function test_hadamard_transform_12n (line 328) | def test_hadamard_transform_12n(dim, dtype):
function test_hadamard_transform_20n (line 352) | def test_hadamard_transform_20n(dim, dtype):
function test_hadamard_transform_28n (line 376) | def test_hadamard_transform_28n(dim, dtype):
function test_hadamard_transform_40n (line 400) | def test_hadamard_transform_40n(dim, dtype):
FILE: python/sglang/jit_kernel/tests/test_moe_lora_align_block_size.py
function round_up (line 16) | def round_up(x, base):
function CEILDIV (line 20) | def CEILDIV(x, y):
function sample_data (line 24) | def sample_data(num_experts, max_loras, num_tokens, topk_num):
function test_moe_lora_align_block_size (line 66) | def test_moe_lora_align_block_size(
FILE: python/sglang/jit_kernel/tests/test_moe_wna16_marlin.py
function _has_aot_moe_wna16_marlin_gemm (line 12) | def _has_aot_moe_wna16_marlin_gemm() -> bool:
function stack_and_dev (line 21) | def stack_and_dev(tensors: list[torch.Tensor]):
function _get_scalar_type (line 26) | def _get_scalar_type(num_bits: int, has_zp: bool):
function _setup_moe_weights (line 34) | def _setup_moe_weights(e, n, k, quant_type, group_size, act_order, dtype):
function _run_single_gemm (line 77) | def _run_single_gemm(
function _run_single_gemm_aot (line 131) | def _run_single_gemm_aot(
function generate_test_cases (line 184) | def generate_test_cases():
function test_moe_wna16_marlin_gemm (line 232) | def test_moe_wna16_marlin_gemm(
FILE: python/sglang/jit_kernel/tests/test_norm_jit.py
function _jit_rmsnorm (line 15) | def _jit_rmsnorm(input, weight, output, eps):
function _fi_rmsnorm (line 21) | def _fi_rmsnorm(input, weight, out, eps):
function _jit_fused_add_rmsnorm (line 27) | def _jit_fused_add_rmsnorm(input, residual, weight, eps):
function _fi_fused_add_rmsnorm (line 33) | def _fi_fused_add_rmsnorm(input, residual, weight, eps):
function test_rmsnorm_jit (line 43) | def test_rmsnorm_jit(batch_size, hidden_size, dtype, specify_out):
function test_fused_add_rmsnorm_jit (line 65) | def test_fused_add_rmsnorm_jit(batch_size, hidden_size, dtype):
FILE: python/sglang/jit_kernel/tests/test_nvfp4_blockwise_moe.py
function _nvfp4_supported (line 14) | def _nvfp4_supported() -> bool:
function _round_up (line 18) | def _round_up(x: int, y: int) -> int:
function _build_expert_offsets (line 22) | def _build_expert_offsets(
function _build_blockscale_offsets (line 31) | def _build_blockscale_offsets(
function test_nvfp4_blockwise_moe_grouped_mm (line 44) | def test_nvfp4_blockwise_moe_grouped_mm(dtype: torch.dtype) -> None:
FILE: python/sglang/jit_kernel/tests/test_nvfp4_gemm.py
function _nvfp4_supported (line 7) | def _nvfp4_supported() -> bool:
function e2m1_to_fp32 (line 35) | def e2m1_to_fp32(int4_value: int) -> float:
function break_fp4_bytes (line 42) | def break_fp4_bytes(a: torch.Tensor) -> torch.Tensor:
function convert_swizzled_to_linear (line 53) | def convert_swizzled_to_linear(
function dequantize_to_dtype (line 67) | def dequantize_to_dtype(
function get_ref_results (line 84) | def get_ref_results(
function test_nvfp4_gemm (line 103) | def test_nvfp4_gemm(dtype: torch.dtype, shape: tuple[int, int, int]) -> ...
FILE: python/sglang/jit_kernel/tests/test_nvfp4_quant.py
function _nvfp4_supported (line 16) | def _nvfp4_supported() -> bool:
function _silu_and_mul_reference (line 20) | def _silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
function cast_from_fp4 (line 60) | def cast_from_fp4(x: torch.Tensor, m: int, n: int) -> torch.Tensor:
function cast_to_fp4 (line 68) | def cast_to_fp4(x: torch.Tensor) -> torch.Tensor:
function get_reciprocal (line 82) | def get_reciprocal(x):
function ref_nvfp4_quant (line 88) | def ref_nvfp4_quant(x: torch.Tensor, global_scale: torch.Tensor):
function recover_swizzled_scales (line 103) | def recover_swizzled_scales(scale: torch.Tensor, m: int, n: int) -> torc...
function test_quantize_to_fp4 (line 118) | def test_quantize_to_fp4(dtype: torch.dtype, shape: tuple[int, int]) -> ...
function test_quantize_to_fp4_padded (line 139) | def test_quantize_to_fp4_padded(shape: tuple[int, int]) -> None:
function test_quantize_to_fp4_grouped (line 160) | def test_quantize_to_fp4_grouped(shape: tuple[int, int, int]) -> None:
function test_silu_and_mul_quantize_to_fp4_grouped (line 186) | def test_silu_and_mul_quantize_to_fp4_grouped(shape: tuple[int, int, int...
FILE: python/sglang/jit_kernel/tests/test_per_tensor_quant_fp8.py
function sglang_scaled_fp8_quant (line 19) | def sglang_scaled_fp8_quant(
function torch_scaled_fp8_quant (line 34) | def torch_scaled_fp8_quant(tensor, inv_scale):
function test_jit_per_tensor_quant_compare_implementations (line 46) | def test_jit_per_tensor_quant_compare_implementations(
function test_jit_per_tensor_quant_supports_3d (line 62) | def test_jit_per_tensor_quant_supports_3d(shape):
FILE: python/sglang/jit_kernel/tests/test_per_token_group_quant_8bit.py
function test_per_token_group_quant_with_column_major (line 108) | def test_per_token_group_quant_with_column_major(
FILE: python/sglang/jit_kernel/tests/test_pos_enc.py
function burn_kernel (line 13) | def burn_kernel(out_ptr, iters: tl.constexpr):
function triton_burn (line 29) | def triton_burn(ms: float, grid=(256,)):
function create_test_inputs (line 36) | def create_test_inputs(
function create_cos_sin_cache (line 59) | def create_cos_sin_cache(rotary_dim, max_position_embeddings, base, dtyp...
function _apply_rotary_emb (line 86) | def _apply_rotary_emb(
class RotaryEmbedding (line 115) | class RotaryEmbedding(torch.nn.Module):
method __init__ (line 117) | def __init__(
method _compute_inv_freq (line 138) | def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
method _compute_cos_sin_cache (line 147) | def _compute_cos_sin_cache(self) -> torch.Tensor:
method forward_native (line 158) | def forward_native(
function get_torch_rotary_embedding (line 199) | def get_torch_rotary_embedding(
function get_sgl_rotary_embedding (line 213) | def get_sgl_rotary_embedding(
function compare_results (line 234) | def compare_results(jit_out, sgl_out, dtype):
function test_correctness (line 284) | def test_correctness(
function test_performance (line 373) | def test_performance(
FILE: python/sglang/jit_kernel/tests/test_qknorm.py
function sglang_aot_qknorm (line 10) | def sglang_aot_qknorm(
function sglang_jit_qknorm (line 25) | def sglang_jit_qknorm(
function flashinfer_qknorm (line 36) | def flashinfer_qknorm(
function torch_impl_qknorm (line 49) | def torch_impl_qknorm(
function test_qknorm (line 82) | def test_qknorm(batch_size: int, n_k: int, n_q: int, head_dim: int) -> N...
FILE: python/sglang/jit_kernel/tests/test_qknorm_across_heads.py
function sglang_jit_qknorm_across_heads (line 10) | def sglang_jit_qknorm_across_heads(
function sglang_aot_qknorm_across_heads (line 21) | def sglang_aot_qknorm_across_heads(
function torch_impl_qknorm_across_heads (line 34) | def torch_impl_qknorm_across_heads(
function test_qknorm_across_heads (line 61) | def test_qknorm_across_heads(batch_size: int, hidden_dim: int) -> None:
FILE: python/sglang/jit_kernel/tests/test_qwen_image_modulation.py
function _tol (line 23) | def _tol(dtype: torch.dtype) -> tuple[float, float]:
function _make_modulation_tensors (line 29) | def _make_modulation_tensors(batch_size: int, hidden_size: int, dtype: t...
function _baseline_select01_modulation (line 39) | def _baseline_select01_modulation(
function _baseline_residual_select01_modulation (line 72) | def _baseline_residual_select01_modulation(
function cuda_setup (line 109) | def cuda_setup():
function test_fused_layernorm_scale_shift_gate_select01 (line 119) | def test_fused_layernorm_scale_shift_gate_select01(
function test_fused_residual_layernorm_scale_shift_gate_select01 (line 166) | def test_fused_residual_layernorm_scale_shift_gate_select01(
FILE: python/sglang/jit_kernel/tests/test_renorm.py
function test_top_k_renorm_probs (line 12) | def test_top_k_renorm_probs(batch_size, vocab_size, k):
function test_top_p_renorm_probs (line 47) | def test_top_p_renorm_probs(batch_size, vocab_size, p):
function test_top_k_mask_logits (line 82) | def test_top_k_mask_logits(batch_size, vocab_size, k, neginf_input):
FILE: python/sglang/jit_kernel/tests/test_rmsnorm.py
function sglang_jit_rmsnorm (line 10) | def sglang_jit_rmsnorm(input: torch.Tensor, weight: torch.Tensor) -> None:
function flashinfer_rmsnorm (line 16) | def flashinfer_rmsnorm(input: torch.Tensor, weight: torch.Tensor) -> None:
function test_rmsnorm (line 36) | def test_rmsnorm(batch_size: int, hidden_size: int) -> None:
FILE: python/sglang/jit_kernel/tests/test_rope.py
function create_cos_sin_cache (line 14) | def create_cos_sin_cache(
function sglang_jit_rope (line 40) | def sglang_jit_rope(
function flashinfer_rope (line 52) | def flashinfer_rope(
function torch_impl_rope (line 75) | def torch_impl_rope(
function test_rope (line 110) | def test_rope(
function test_rope_position_dtypes (line 138) | def test_rope_position_dtypes(dtype: torch.dtype) -> None:
function test_partial_rope (line 163) | def test_partial_rope(batch_size: int, is_neox: bool, rope_dim: int, hea...
function test_fused_rope_store (line 190) | def test_fused_rope_store(
FILE: python/sglang/jit_kernel/tests/test_store_cache.py
function test_store_cache (line 24) | def test_store_cache(batch_size: int, element_dim: int) -> None:
function test_store_cache_dtypes (line 49) | def test_store_cache_dtypes(
function test_store_cache_int32_indices (line 68) | def test_store_cache_int32_indices(batch_size: int, element_dim: int) ->...
function _valid_num_splits (line 82) | def _valid_num_splits(element_dim: int, dtype: torch.dtype) -> list:
function test_store_cache_num_split (line 102) | def test_store_cache_num_split(
function test_can_use_store_cache (line 119) | def test_can_use_store_cache() -> None:
FILE: python/sglang/jit_kernel/tests/test_timestep_embedding.py
function get_timestep_embedding_reference (line 37) | def get_timestep_embedding_reference(
function test_timestep_embedding_correctness_with_sgld (line 73) | def test_timestep_embedding_correctness_with_sgld(batch_size, dim, dtype):
function test_timestep_embedding_correctness_with_diffusers (line 91) | def test_timestep_embedding_correctness_with_diffusers(
function test_timestep_embedding_perf (line 115) | def test_timestep_embedding_perf():
FILE: python/sglang/jit_kernel/timestep_embedding.py
function _jit_timestep_embedding_module (line 14) | def _jit_timestep_embedding_module(dtype: torch.dtype) -> Module:
function timestep_embedding (line 24) | def timestep_embedding(
FILE: python/sglang/jit_kernel/utils.py
function is_in_ci (line 17) | def is_in_ci() -> bool:
function should_run_full_tests (line 22) | def should_run_full_tests() -> bool:
function get_ci_test_range (line 26) | def get_ci_test_range(full_range: List[Any], ci_range: List[Any]) -> Lis...
function cache_once (line 32) | def cache_once(fn: F) -> F:
function _make_wrapper (line 49) | def _make_wrapper(tup: Tuple[str, str]) -> str:
function _resolve_kernel_path (line 55) | def _resolve_kernel_path() -> pathlib.Path:
class CPPArgList (line 86) | class CPPArgList(list[str]):
method __str__ (line 87) | def __str__(self) -> str:
function is_hip_runtime (line 104) | def is_hip_runtime() -> bool:
function make_cpp_args (line 108) | def make_cpp_args(*args: CPP_TEMPLATE_TYPE) -> CPPArgList:
function load_jit (line 121) | def load_jit(
function is_arch_support_pdl (line 215) | def is_arch_support_pdl() -> bool:
function _get_cuda_arch_value (line 223) | def _get_cuda_arch_value() -> int:
function _get_cuda_arch_list (line 231) | def _get_cuda_arch_list() -> str:
FILE: python/sglang/lang/api.py
function function (line 23) | def function(
function Runtime (line 35) | def Runtime(*args, **kwargs):
function Engine (line 42) | def Engine(*args, **kwargs):
function set_default_backend (line 49) | def set_default_backend(backend: BaseBackend):
function flush_cache (line 53) | def flush_cache(backend: Optional[BaseBackend] = None):
function get_server_info (line 64) | def get_server_info(backend: Optional[BaseBackend] = None):
function gen (line 75) | def gen(
function gen_int (line 142) | def gen_int(
function gen_string (line 185) | def gen_string(
function image (line 228) | def image(expr: SglExpr):
function video (line 232) | def video(path: str, num_frames: int):
function select (line 236) | def select(
function _role_common (line 246) | def _role_common(name: str, expr: Optional[SglExpr] = None):
function system (line 253) | def system(expr: Optional[SglExpr] = None):
function user (line 257) | def user(expr: Optional[SglExpr] = None):
function assistant (line 261) | def assistant(expr: Optional[SglExpr] = None):
function system_begin (line 265) | def system_begin():
function system_end (line 269) | def system_end():
function user_begin (line 273) | def user_begin():
function user_end (line 277) | def user_end():
function assistant_begin (line 281) | def assistant_begin():
function assistant_end (line 285) | def assistant_end():
function separate_reasoning (line 289) | def separate_reasoning(
FILE: python/sglang/lang/backend/anthropic.py
class Anthropic (line 12) | class Anthropic(BaseBackend):
method __init__ (line 13) | def __init__(self, model_name, *args, **kwargs):
method get_chat_template (line 23) | def get_chat_template(self):
method generate (line 26) | def generate(
method generate_stream (line 51) | def generate_stream(
FILE: python/sglang/lang/backend/base_backend.py
class BaseBackend (line 9) | class BaseBackend:
method __init__ (line 10) | def __init__(self) -> None:
method get_model_name (line 14) | def get_model_name(self):
method get_chat_template (line 17) | def get_chat_template(self):
method cache_prefix (line 20) | def cache_prefix(self, prefix_str: str):
method uncache_prefix (line 23) | def uncache_prefix(self, rid: str):
method end_request (line 26) | def end_request(self, rid: Union[str, List[str]]):
method begin_program (line 29) | def begin_program(self, s: StreamExecutor):
method end_program (line 32) | def end_program(self, s: Union[StreamExecutor, List[StreamExecutor]]):
method commit_lazy_operations (line 35) | def commit_lazy_operations(self, s: StreamExecutor):
method fork_program (line 38) | def fork_program(
method fill_image (line 46) | def fill_image(self, s: StreamExecutor):
method generate (line 49) | def generate(
method generate_stream (line 56) | def generate_stream(
method select (line 63) | def select(
method concatenate_and_append (line 72) | def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
method shutdown (line 75) | def shutdown(self):
method flush_cache (line 78) | def flush_cache(self):
method get_server_info (line 81) | def get_server_info(self):
FILE: python/sglang/lang/backend/litellm.py
class LiteLLM (line 15) | class LiteLLM(BaseBackend):
method __init__ (line 16) | def __init__(
method get_chat_template (line 47) | def get_chat_template(self):
method generate (line 50) | def generate(
method generate_stream (line 70) | def generate_stream(
FILE: python/sglang/lang/backend/openai.py
function create_logit_bias_int (line 25) | def create_logit_bias_int(tokenizer):
class TokenUsage (line 48) | class TokenUsage:
method reset (line 52) | def reset(self):
class OpenAI (line 56) | class OpenAI(BaseBackend):
method __init__ (line 57) | def __init__(
method get_chat_template (line 106) | def get_chat_template(self):
method _prepare_spec_execution (line 109) | def _prepare_spec_execution(
method generate (line 140) | def generate(
method spec_fill (line 224) | def spec_fill(self, value: str):
method spec_pattern_match (line 228) | def spec_pattern_match(self, comp):
method role_end_generate (line 248) | def role_end_generate(
method generate_stream (line 283) | def generate_stream(
method select (line 312) | def select(
function openai_completion (line 383) | def openai_completion(
function openai_completion_stream (line 425) | def openai_completion_stream(
FILE: python/sglang/lang/backend/runtime_endpoint.py
class RuntimeEndpoint (line 26) | class RuntimeEndpoint(BaseBackend):
method __init__ (line 27) | def __init__(
method get_model_name (line 56) | def get_model_name(self):
method flush_cache (line 59) | def flush_cache(self):
method get_server_info (line 68) | def get_server_info(self):
method get_chat_template (line 77) | def get_chat_template(self):
method cache_prefix (line 80) | def cache_prefix(self, prefix_str: str):
method start_profile (line 89) | def start_profile(self):
method stop_profile (line 97) | def stop_profile(self):
method commit_lazy_operations (line 105) | def commit_lazy_operations(self, s: StreamExecutor):
method fill_image (line 116) | def fill_image(self, s: StreamExecutor):
method _handle_dtype_to_regex (line 127) | def _handle_dtype_to_regex(self, sampling_params: SglSamplingParams):
method generate (line 159) | def generate(
method generate_stream (line 198) | def generate_stream(
method select (line 248) | def select(
method concatenate_and_append (line 317) | def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
method _generate_http_request (line 326) | def _generate_http_request(self, s: StreamExecutor, data):
method _add_images (line 337) | def _add_images(self, s: StreamExecutor, data):
method _assert_success (line 342) | def _assert_success(self, res):
function compute_normalized_prompt_logprobs (line 351) | def compute_normalized_prompt_logprobs(input_logprobs):
class Runtime (line 356) | class Runtime:
method __init__ (line 366) | def __init__(
method shutdown (line 436) | def shutdown(self):
method start_profile (line 443) | def start_profile(self):
method stop_profile (line 446) | def stop_profile(self):
method cache_prefix (line 449) | def cache_prefix(self, prefix: str):
method get_tokenizer (line 452) | def get_tokenizer(self):
method async_generate (line 462) | async def async_generate(
method generate (line 500) | def generate(
method encode (line 524) | def encode(
method get_server_info (line 532) | async def get_server_info(self):
method __del__ (line 543) | def __del__(self):
FILE: python/sglang/lang/backend/vertexai.py
class VertexAI (line 20) | class VertexAI(BaseBackend):
method __init__ (line 21) | def __init__(self, model_name, safety_settings=None):
method get_chat_template (line 35) | def get_chat_template(self):
method generate (line 38) | def generate(
method generate_stream (line 62) | def generate_stream(
method text_to_vertexai_input (line 85) | def text_to_vertexai_input(self, text, images):
method messages_to_vertexai_input (line 99) | def messages_to_vertexai_input(self, messages):
FILE: python/sglang/lang/chat_template.py
class ChatTemplateStyle (line 7) | class ChatTemplateStyle(Enum):
class ChatTemplate (line 13) | class ChatTemplate:
method get_prefix_and_suffix (line 22) | def get_prefix_and_suffix(
method get_prompt (line 43) | def get_prompt(self, messages: List[Dict]) -> str:
function register_chat_template (line 61) | def register_chat_template(template):
function register_chat_template_matching_function (line 65) | def register_chat_template_matching_function(func):
function get_chat_template (line 69) | def get_chat_template(name):
function get_chat_template_by_model_path (line 73) | def get_chat_template_by_model_path(model_path):
function match_deepseek (line 528) | def match_deepseek(model_path: str):
function match_orion (line 536) | def match_orion(model_path: str):
function match_deepseek_janus_pro (line 542) | def match_deepseek_janus_pro(model_path: str):
function match_dbrx (line 548) | def match_dbrx(model_path: str):
function match_vicuna (line 556) | def match_vicuna(model_path: str):
function match_llama2_chat (line 562) | def match_llama2_chat(model_path: str):
function match_mistral (line 572) | def match_mistral(model_path: str):
function match_llama3_instruct (line 578) | def match_llama3_instruct(model_path: str):
function match_chat_ml (line 584) | def match_chat_ml(model_path: str):
function match_chat_yi (line 604) | def match_chat_yi(model_path: str):
function match_gemma_it (line 614) | def match_gemma_it(model_path: str):
function match_openbmb_minicpm (line 620) | def match_openbmb_minicpm(model_path: str):
function match_c4ai_command_r (line 628) | def match_c4ai_command_r(model_path: str):
function match_granite_instruct (line 634) | def match_granite_instruct(model_path: str):
function match_gemma3_instruct (line 640) | def match_gemma3_instruct(model_path: str):
function match_internvl_chat (line 646) | def match_internvl_chat(model_path: str):
function match_interns1_chat (line 652) | def match_interns1_chat(model_path: str):
FILE: python/sglang/lang/choices.py
class ChoicesDecision (line 9) | class ChoicesDecision:
class ChoicesSamplingMethod (line 14) | class ChoicesSamplingMethod(ABC):
method requires_unconditional_logprobs (line 17) | def requires_unconditional_logprobs(self) -> bool:
method __call__ (line 21) | def __call__(
class TokenLengthNormalized (line 32) | class TokenLengthNormalized(ChoicesSamplingMethod):
method __call__ (line 34) | def __call__(
class GreedyTokenSelection (line 56) | class GreedyTokenSelection(ChoicesSamplingMethod):
method __call__ (line 58) | def __call__(
method _build_logprob_matrix (line 87) | def _build_logprob_matrix(self, input_token_logprobs, max_tokens, num_...
method _greedy_selection (line 97) | def _greedy_selection(self, logprob_matrix, num_options, max_tokens):
class UnconditionalLikelihoodNormalized (line 110) | class UnconditionalLikelihoodNormalized(ChoicesSamplingMethod):
method requires_unconditional_logprobs (line 113) | def requires_unconditional_logprobs(self) -> bool:
method __call__ (line 116) | def __call__(
method _normalize_logprobs (line 150) | def _normalize_logprobs(self, input_token_logprobs, unconditional_toke...
FILE: python/sglang/lang/interpreter.py
function run_internal (line 42) | def run_internal(state, program, func_args, func_kwargs, sync):
function run_program (line 57) | def run_program(
function run_program_batch (line 93) | def run_program_batch(
function _run_program_batch_generator (line 184) | def _run_program_batch_generator(
function cache_program (line 242) | def cache_program(program, backend):
class StreamExecutor (line 250) | class StreamExecutor:
method __init__ (line 253) | def __init__(
method submit (line 318) | def submit(self, expr: SglExpr):
method sync (line 326) | def sync(self):
method get_var (line 330) | def get_var(self, name):
method set_var (line 335) | def set_var(self, name, value):
method get_meta_info (line 338) | def get_meta_info(self, name, timeout=None):
method fork (line 346) | def fork(
method text (line 380) | def text(self):
method messages (line 384) | def messages(self):
method error (line 388) | def error(self):
method end (line 392) | def end(self):
method _thread_worker_func (line 398) | def _thread_worker_func(self):
method _execute (line 437) | def _execute(self, other):
method _execute_fill (line 481) | def _execute_fill(self, value: str, prefix=False):
method _execute_image (line 500) | def _execute_image(self, expr: SglImage):
method _execute_video (line 509) | def _execute_video(self, expr: SglVideo):
method _spec_gen (line 519) | def _spec_gen(self, sampling_params):
method _execute_gen (line 569) | def _execute_gen(self, expr: SglGen):
method _execute_select (line 623) | def _execute_select(self, expr: SglSelect):
method _execute_variable (line 636) | def _execute_variable(self, expr: SglVariable):
method _execute_role_begin (line 641) | def _execute_role_begin(self, expr: SglRoleBegin):
method _execute_role_end (line 659) | def _execute_role_end(self, expr: SglRoleEnd):
method _execute_var_scope_begin (line 695) | def _execute_var_scope_begin(self, expr: SglVarScopeBegin):
method _execute_var_scope_end (line 698) | def _execute_var_scope_end(self, expr: SglVarScopeEnd):
method _execute_commit_lazy_operations (line 702) | def _execute_commit_lazy_operations(self, expr: SglCommitLazy):
method _execute_concatenate_and_append_text (line 705) | def _execute_concatenate_and_append_text(self, expr: SglConcateAndAppe...
method _execute_concatenate_and_append_kv_cache (line 714) | def _execute_concatenate_and_append_kv_cache(self, expr: SglConcateAnd...
method _execute_separate_reasoning (line 730) | def _execute_separate_reasoning(self, expr: SglSeparateReasoning):
method _init_var_event (line 764) | def _init_var_event(self, expr):
method _resolve_sampling_params (line 775) | def _resolve_sampling_params(self, sampling_params):
method __del__ (line 824) | def __del__(self):
class ProgramState (line 828) | class ProgramState:
method __init__ (line 831) | def __init__(self, stream_executor: StreamExecutor):
method _role_common (line 834) | def _role_common(self, name: str, expr: Optional[SglExpr] = None):
method system (line 849) | def system(self, expr: Optional[SglExpr] = None):
method user (line 852) | def user(self, expr: Optional[SglExpr] = None):
method assistant (line 855) | def assistant(self, expr: Optional[SglExpr] = None):
method var_scope (line 859) | def var_scope(self, name: str):
method fork (line 864) | def fork(
method copy (line 875) | def copy(self, position_ids_offset: Optional[List[int]] = None):
method text (line 882) | def text(self):
method messages (line 885) | def messages(self):
method sync (line 888) | def sync(self):
method error (line 891) | def error(self):
method text_iter (line 894) | def text_iter(self, var_name: Optional[str] = None):
method text_async_iter (line 932) | async def text_async_iter(
method get_var (line 977) | def get_var(self, name):
method set_var (line 980) | def set_var(self, name, value):
method get_meta_info (line 983) | def get_meta_info(self, name):
method __iadd__ (line 986) | def __iadd__(self, other):
method __getitem__ (line 992) | def __getitem__(self, name):
method __setitem__ (line 995) | def __setitem__(self, name, value):
method __contains__ (line 998) | def __contains__(self, name):
method __del__ (line 1001) | def __del__(self):
method __repr__ (line 1004) | def __repr__(self) -> str:
class ProgramStateGroup (line 1008) | class ProgramStateGroup:
method __init__ (line 1009) | def __init__(
method join (line 1015) | def join(self, mode: str = "gather_variable"):
method __getitem__ (line 1041) | def __getitem__(self, i: int):
method __setitem__ (line 1044) | def __setitem__(self, i: int, value):
method __iadd__ (line 1047) | def __iadd__(self, other):
FILE: python/sglang/lang/ir.py
class SglSamplingParams (line 18) | class SglSamplingParams:
method clone (line 42) | def clone(self):
method to_openai_kwargs (line 64) | def to_openai_kwargs(self):
method to_vertexai_kwargs (line 79) | def to_vertexai_kwargs(self):
method to_anthropic_kwargs (line 93) | def to_anthropic_kwargs(self):
method to_litellm_kwargs (line 109) | def to_litellm_kwargs(self):
method to_srt_kwargs (line 121) | def to_srt_kwargs(self):
class SglFunction (line 141) | class SglFunction:
method __init__ (line 142) | def __init__(self, func, num_api_spec_tokens=None, bind_arguments=None):
method bind (line 154) | def bind(self, **kwargs):
method run (line 160) | def run(
method run_batch (line 223) | def run_batch(
method trace (line 304) | def trace(self, *, backend=None, **kwargs):
method cache (line 310) | def cache(self, backend=None):
method __call__ (line 316) | def __call__(self, *args, **kwargs):
class SglExpr (line 327) | class SglExpr:
method __init__ (line 330) | def __init__(self):
method __add__ (line 336) | def __add__(self, other):
method __radd__ (line 343) | def __radd__(self, other):
method concatenate_ir (line 350) | def concatenate_ir(self, a, b):
method print_graph_dfs (line 361) | def print_graph_dfs(self):
class SglExprList (line 397) | class SglExprList(SglExpr):
method __init__ (line 398) | def __init__(self, expr_list: List[SglExpr]):
method __repr__ (line 402) | def __repr__(self):
class SglArgument (line 406) | class SglArgument(SglExpr):
method __init__ (line 407) | def __init__(self, name: str, value: str):
method __repr__ (line 412) | def __repr__(self):
method __len__ (line 415) | def __len__(self):
method __getitem__ (line 418) | def __getitem__(self, i):
method __int__ (line 421) | def __int__(self):
method __bool__ (line 424) | def __bool__(self):
method __format__ (line 427) | def __format__(self, *args):
class SglImage (line 434) | class SglImage(SglExpr):
method __init__ (line 435) | def __init__(self, path: str):
method __repr__ (line 438) | def __repr__(self) -> str:
class SglVideo (line 442) | class SglVideo(SglExpr):
method __init__ (line 443) | def __init__(self, path: str, num_frames: int):
method __repr__ (line 447) | def __repr__(self) -> str:
class SglGen (line 451) | class SglGen(SglExpr):
method __init__ (line 452) | def __init__(
method __repr__ (line 502) | def __repr__(self):
class SglConstantText (line 506) | class SglConstantText(SglExpr):
method __init__ (line 507) | def __init__(self, value: str):
method __repr__ (line 511) | def __repr__(self):
class SglRoleBegin (line 515) | class SglRoleBegin(SglExpr):
method __init__ (line 516) | def __init__(self, role: str):
method __repr__ (line 520) | def __repr__(self):
class SglRoleEnd (line 524) | class SglRoleEnd(SglExpr):
method __init__ (line 525) | def __init__(self, role: str):
method __repr__ (line 529) | def __repr__(self):
class SglSelect (line 533) | class SglSelect(SglExpr):
method __init__ (line 535) | def __init__(
method __repr__ (line 548) | def __repr__(self):
class SglFork (line 552) | class SglFork(SglExpr):
method __init__ (line 553) | def __init__(self, number: int, position_ids_offset=None):
method __repr__ (line 558) | def __repr__(self):
class SglGetForkItem (line 565) | class SglGetForkItem(SglExpr):
method __init__ (line 566) | def __init__(self, index: int):
method __repr__ (line 570) | def __repr__(self):
class SglVariable (line 574) | class SglVariable(SglExpr):
method __init__ (line 575) | def __init__(self, name: str, source):
method __repr__ (line 580) | def __repr__(self):
class SglVarScopeBegin (line 584) | class SglVarScopeBegin(SglExpr):
method __init__ (line 585) | def __init__(self, name: str):
method __repr__ (line 589) | def __repr__(self):
class SglVarScopeEnd (line 593) | class SglVarScopeEnd(SglExpr):
method __init__ (line 594) | def __init__(self, name: str):
method __repr__ (line 598) | def __repr__(self):
class SglConcateAndAppend (line 602) | class SglConcateAndAppend(SglExpr):
method __init__ (line 603) | def __init__(self, states):
method __repr__ (line 607) | def __repr__(self):
class SglCommitLazy (line 611) | class SglCommitLazy(SglExpr):
method __init__ (line 612) | def __init__(self):
method __repr__ (line 615) | def __repr__(self):
class SglSeparateReasoning (line 619) | class SglSeparateReasoning(SglExpr):
method __init__ (line 620) | def __init__(self, model_type: str, expr: SglExpr):
method process_name_for_reasoning (line 628) | def process_name_for_reasoning(self, name):
method _process_expr (line 633) | def _process_expr(self, expr):
method __repr__ (line 642) | def __repr__(self):
FILE: python/sglang/lang/tracer.py
class StopTracing (line 25) | class StopTracing(Exception):
function extract_prefix_by_tracing (line 29) | def extract_prefix_by_tracing(program, backend):
function trace_program (line 54) | def trace_program(program, arguments, backend):
class TracerProgramState (line 75) | class TracerProgramState(ProgramState):
method __init__ (line 76) | def __init__(self, backend, arguments, only_trace_prefix):
method fork (line 108) | def fork(self, size: int = 1, position_ids_offset: Optional[List[int]]...
method _append_node (line 139) | def _append_node(self, other: SglExpr):
method _execute (line 144) | def _execute(self, other: SglExpr):
method __iadd__ (line 175) | def __iadd__(self, other):
method _execute_fill (line 179) | def _execute_fill(self, expr: SglConstantText):
method _execute_gen (line 184) | def _execute_gen(self, expr: SglGen):
method _execute_select (line 190) | def _execute_select(self, expr: SglSelect):
method _execute_role_begin (line 198) | def _execute_role_begin(self, expr: SglRoleBegin):
method _execute_role_end (line 217) | def _execute_role_end(self, expr: SglRoleEnd):
method _execute_var_scope_end (line 228) | def _execute_var_scope_end(self, expr: SglVarScopeEnd):
method get_var (line 232) | def get_var(self, name):
method flatten_nodes (line 240) | def flatten_nodes(self):
method __del__ (line 253) | def __del__(self):
class TracingScope (line 257) | class TracingScope:
method __init__ (line 260) | def __init__(self, tracer_state: TracerProgramState):
method __enter__ (line 264) | def __enter__(self):
method __exit__ (line 268) | def __exit__(self, exc_type, exc_value, traceback):
method get_current_scope (line 272) | def get_current_scope():
method add_child_state (line 275) | def add_child_state(self, state: TracerProgramState):
FILE: python/sglang/launch_server.py
function run_server (line 15) | def run_server(server_args):
FILE: python/sglang/multimodal_gen/.claude/skills/diffusion-kernel/scripts/bench_diffusion_denoise.py
function required_gpus_for_model (line 230) | def required_gpus_for_model(model_key: str) -> int:
function build_sglang_cmd (line 238) | def build_sglang_cmd(
function run_benchmark_once (line 284) | def run_benchmark_once(
function print_results_table (line 377) | def print_results_table(results: list[dict]):
function inject_kernels_example (line 420) | def inject_kernels_example():
function main (line 475) | def main():
FILE: python/sglang/multimodal_gen/.claude/skills/diffusion-kernel/scripts/bench_diffusion_rmsnorm.py
function pytorch_rmsnorm (line 52) | def pytorch_rmsnorm(
function benchmark_kernel (line 64) | def benchmark_kernel(
function run_benchmark (line 86) | def run_benchmark():
FILE: python/sglang/multimodal_gen/.claude/skills/diffusion-kernel/scripts/diffusion_skill_env.py
function get_repo_root (line 16) | def get_repo_root() -> Path:
function get_assets_dir (line 22) | def get_assets_dir(repo_root: Path | None = None) -> Path:
function get_output_dir (line 27) | def get_output_dir(name: str, repo_root: Path | None = None) -> Path:
function ensure_dir (line 34) | def ensure_dir(path: Path) -> Path:
function check_write_access (line 39) | def check_write_access(repo_root: Path | None = None) -> Path:
function _run_nvidia_smi (line 47) | def _run_nvidia_smi(query: str) -> list[list[str]]:
function get_gpu_inventory (line 63) | def get_gpu_inventory() -> list[dict[str, int | str]]:
function get_busy_gpu_uuids (line 79) | def get_busy_gpu_uuids() -> set[str]:
function pick_idle_gpus (line 84) | def pick_idle_gpus(
function configure_runtime_env (line 107) | def configure_runtime_env(required_gpus: int = 1) -> str | None:
function main (line 116) | def main() -> None:
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/core/generator.py
class SGLDiffusionGenerator (line 35) | class SGLDiffusionGenerator:
method __init__ (line 38) | def __init__(self):
method __del__ (line 57) | def __del__(self):
method init_generator (line 60) | def init_generator(
method kill_generator (line 77) | def kill_generator(self):
method close_generator (line 115) | def close_generator(self):
method get_comfyui_model (line 126) | def get_comfyui_model(self, model_path: str, model_options: dict = None):
method load_model (line 185) | def load_model(
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/core/model_patcher.py
class SGLDModelPatcher (line 10) | class SGLDModelPatcher(ModelPatcher):
method __init__ (line 13) | def __init__(
method clone (line 32) | def clone(self):
method model_size (line 53) | def model_size(self):
method load (line 60) | def load(
method patch_model (line 70) | def patch_model(
method unpatch_model (line 80) | def unpatch_model(self, device_to=None, unpatch_weights=True):
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/core/server_api.py
class SGLDiffusionServerAPI (line 16) | class SGLDiffusionServerAPI:
method __init__ (line 19) | def __init__(self, base_url: str, api_key: str = "sk-proj-1234567890"):
method get_model_info (line 41) | def get_model_info(self) -> Dict[str, Any]:
method generate_image (line 63) | def generate_image(
method generate_video (line 205) | def generate_video(
method _build_image_common_params (line 351) | def _build_image_common_params(
method _get_content_type (line 399) | def _get_content_type(self, file_path: str) -> str:
method decode_image_from_response (line 410) | def decode_image_from_response(
method set_lora (line 442) | def set_lora(
method unset_lora (line 489) | def unset_lora(
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/executors/base.py
class SGLDiffusionExecutor (line 8) | class SGLDiffusionExecutor(torch.nn.Module):
method __init__ (line 11) | def __init__(self, generator, model_path, model, config):
method should_suppress_logs (line 21) | def should_suppress_logs(timestep):
method set_lora (line 27) | def set_lora(self, lora_nickname=None, lora_path=None, strength=None, ...
method _unpack_latents (line 37) | def _unpack_latents(self, latents, height, width, channels):
method _pack_latents (line 46) | def _pack_latents(self, latents):
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/executors/flux.py
class FluxExecutor (line 18) | class FluxExecutor(SGLDiffusionExecutor):
method __init__ (line 21) | def __init__(self, generator, model_path, model, config):
method forward (line 24) | def forward(self, x, timestep, context, y=None, guidance=None, **kwargs):
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/executors/qwen_image.py
class QwenImageExecutor (line 20) | class QwenImageExecutor(SGLDiffusionExecutor):
method __init__ (line 23) | def __init__(self, generator, model_path, model, config):
method _pack_latents (line 27) | def _pack_latents(self, x):
method _unpack_latents (line 52) | def _unpack_latents(self, latents, num_embeds, orig_shape, x):
method forward (line 67) | def forward(self, x, timestep, context, **kwargs):
class QwenImageEditExecutor (line 108) | class QwenImageEditExecutor(QwenImageExecutor):
method __init__ (line 111) | def __init__(self, generator, model_path, model, config):
method forward (line 114) | def forward(
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/executors/zimage.py
class ZImageExecutor (line 18) | class ZImageExecutor(SGLDiffusionExecutor):
method __init__ (line 21) | def __init__(self, generator, model_path, model, config):
method forward (line 24) | def forward(self, x, timesteps, context, **kwargs):
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/nodes.py
class SGLDOptions (line 21) | class SGLDOptions:
method INPUT_TYPES (line 23) | def INPUT_TYPES(cls):
method create_options (line 72) | def create_options(
class SGLDLoraLoader (line 113) | class SGLDLoraLoader:
method INPUT_TYPES (line 115) | def INPUT_TYPES(cls):
method load_lora (line 137) | def load_lora(
class SGLDUNETLoader (line 166) | class SGLDUNETLoader:
method __init__ (line 167) | def __init__(self):
method INPUT_TYPES (line 171) | def INPUT_TYPES(s):
method load_unet (line 187) | def load_unet(self, unet_name, weight_dtype, sgld_options: dict = None):
class SGLDiffusionServerModel (line 202) | class SGLDiffusionServerModel:
method INPUT_TYPES (line 206) | def INPUT_TYPES(cls):
method load_server (line 231) | def load_server(self, base_url: str, api_key: str):
class SGLDiffusionGenerateImage (line 246) | class SGLDiffusionGenerateImage:
method INPUT_TYPES (line 250) | def INPUT_TYPES(cls):
method generate_image (line 336) | def generate_image(
class SGLDiffusionGenerateVideo (line 397) | class SGLDiffusionGenerateVideo:
method INPUT_TYPES (line 401) | def INPUT_TYPES(cls):
method generate_video (line 514) | def generate_video(
class SGLDiffusionServerSetLora (line 579) | class SGLDiffusionServerSetLora:
method INPUT_TYPES (line 583) | def INPUT_TYPES(cls):
method set_lora (line 624) | def set_lora(
class SGLDiffusionServerUnsetLora (line 650) | class SGLDiffusionServerUnsetLora:
method INPUT_TYPES (line 654) | def INPUT_TYPES(cls):
method unset_lora (line 681) | def unset_lora(
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/test/test_flux_pipeline.py
function test_comfyui_flux_pipeline_direct (line 13) | def test_comfyui_flux_pipeline_direct() -> None:
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/test/test_qwen_image_edit_pipeline.py
function test_comfyui_qwen_image_edit_pipeline_direct (line 13) | def test_comfyui_qwen_image_edit_pipeline_direct() -> None:
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/test/test_qwen_image_pipeline.py
function test_comfyui_qwen_image_pipeline_direct (line 13) | def test_comfyui_qwen_image_pipeline_direct() -> None:
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/test/test_zimage_pipeline.py
function test_comfyui_zimage_pipeline_direct (line 13) | def test_comfyui_zimage_pipeline_direct() -> None:
FILE: python/sglang/multimodal_gen/apps/ComfyUI_SGLDiffusion/utils.py
function _ensure_dir (line 15) | def _ensure_dir(path: str) -> None:
function _to_numpy_image (line 19) | def _to_numpy_image(image: torch.Tensor) -> np.ndarray:
function _to_hwc_tensor (line 35) | def _to_hwc_tensor(image: torch.Tensor) -> torch.Tensor:
function is_empty_image (line 52) | def is_empty_image(image: torch.Tensor, tolerance: float = 1e-6) -> bool:
function get_image_path (line 80) | def get_image_path(image: torch.Tensor) -> str:
function convert_b64_to_tensor_image (line 100) | def convert_b64_to_tensor_image(b64_image: str) -> torch.Tensor:
class SGLDVideoInput (line 131) | class SGLDVideoInput(VideoInput):
method __init__ (line 132) | def __init__(self, video_path: str, height: int, width: int):
method get_dimensions (line 139) | def get_dimensions(self) -> tuple[int, int]:
method get_components (line 148) | def get_components(self):
method save_to (line 155) | def save_to(self, path: str, format=None, codec=None, metadata=None):
function convert_video_to_comfy_video (line 169) | def convert_video_to_comfy_video(
FILE: python/sglang/multimodal_gen/apps/webui/main.py
function add_webui_args (line 20) | def add_webui_args(parser: argparse.ArgumentParser):
function run_sgl_diffusion_webui (line 27) | def run_sgl_diffusion_webui(server_args: ServerArgs):
FILE: python/sglang/multimodal_gen/benchmarks/bench_offline_throughput.py
class BatchOutput (line 50) | class BatchOutput:
class BenchArgs (line 63) | class BenchArgs:
method add_cli_args (line 91) | def add_cli_args(parser: argparse.ArgumentParser):
method from_cli_args (line 171) | def from_cli_args(cls, args: argparse.Namespace):
function initialize_engine (line 177) | def initialize_engine(server_args: ServerArgs) -> DiffGenerator:
function generate_batch (line 185) | def generate_batch(
function calculate_metrics (line 225) | def calculate_metrics(
function throughput_test (line 265) | def throughput_test(
function display_results (line 349) | def display_results(
function save_results (line 390) | def save_results(
function main (line 421) | def main():
FILE: python/sglang/multimodal_gen/benchmarks/bench_serving.py
function _compute_scale_factor (line 50) | def _compute_scale_factor(req: RequestFuncInput, args) -> Optional[float]:
function _compute_expected_latency_ms_from_base (line 66) | def _compute_expected_latency_ms_from_base(
function _infer_slo_base_time_ms_from_warmups (line 78) | def _infer_slo_base_time_ms_from_warmups(
function _populate_slo_ms_from_warmups (line 99) | def _populate_slo_ms_from_warmups(
function async_request_image_sglang (line 129) | async def async_request_image_sglang(
function async_request_video_sglang (line 223) | async def async_request_video_sglang(
function calculate_metrics (line 380) | def calculate_metrics(
function wait_for_service (line 434) | def wait_for_service(base_url: str, timeout: int = 1200) -> None:
function benchmark (line 455) | async def benchmark(args):
FILE: python/sglang/multimodal_gen/benchmarks/compare_perf.py
function calculate_diff (line 9) | def calculate_diff(base: float, new: float) -> Tuple[float, float]:
function calculate_upper_bound (line 19) | def calculate_upper_bound(baseline: float, rel_tol: float, min_abs_tol: ...
function calculate_lower_bound (line 26) | def calculate_lower_bound(baseline: float, rel_tol: float, min_abs_tol: ...
function get_perf_status_emoji (line 33) | def get_perf_status_emoji(
function consolidate_steps (line 57) | def consolidate_steps(
function _load_benchmark_file (line 106) | def _load_benchmark_file(file_path: str) -> Dict[str, Any]:
function _get_status_emoji_from_diff_percent (line 112) | def _get_status_emoji_from_diff_percent(diff_pct):
function _print_single_comparison_report (line 121) | def _print_single_comparison_report(
function _print_multi_comparison_report (line 168) | def _print_multi_comparison_report(
function compare_benchmarks (line 214) | def compare_benchmarks(file_paths: List[str], output_format: str = "mark...
FILE: python/sglang/multimodal_gen/benchmarks/datasets.py
class RequestFuncInput (line 20) | class RequestFuncInput:
class RequestFuncOutput (line 36) | class RequestFuncOutput:
function is_dir_not_empty (line 46) | def is_dir_not_empty(path: str) -> bool:
class BaseDataset (line 50) | class BaseDataset(ABC):
method __init__ (line 51) | def __init__(self, args, api_url: str = "", model: str = ""):
method __len__ (line 58) | def __len__(self) -> int:
method __getitem__ (line 62) | def __getitem__(self, idx: int) -> RequestFuncInput:
method get_requests (line 65) | def get_requests(self) -> List[RequestFuncInput]:
class VBenchDataset (line 69) | class VBenchDataset(BaseDataset):
method __init__ (line 78) | def __init__(self, args, api_url: str = "", model: str = ""):
method _load_data (line 83) | def _load_data(self) -> List[Dict[str, Any]]:
method _download_file (line 93) | def _download_file(self, url: str, dest_path: str) -> None:
method _load_t2v_prompts (line 101) | def _load_t2v_prompts(self) -> List[Dict[str, Any]]:
method _auto_download_i2v_dataset (line 123) | def _auto_download_i2v_dataset(self) -> Optional[str]:
method _load_from_i2v_json (line 176) | def _load_from_i2v_json(self, json_path: str) -> List[Dict[str, Any]]:
method _scan_directory_for_images (line 197) | def _scan_directory_for_images(self, path: str) -> List[Dict[str, Any]]:
method _create_dummy_data (line 216) | def _create_dummy_data(self) -> List[Dict[str, Any]]:
method _load_i2v_data (line 229) | def _load_i2v_data(self) -> List[Dict[str, Any]]:
method _resize_data (line 256) | def _resize_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, A...
method __len__ (line 267) | def __len__(self) -> int:
method __getitem__ (line 270) | def __getitem__(self, idx: int) -> RequestFuncInput:
class RandomDataset (line 284) | class RandomDataset(BaseDataset):
method __init__ (line 285) | def __init__(self, args, api_url: str = "", model: str = ""):
method __len__ (line 289) | def __len__(self) -> int:
method __getitem__ (line 292) | def __getitem__(self, idx: int) -> RequestFuncInput:
FILE: python/sglang/multimodal_gen/configs/models/adapter/base.py
class AdapterArchConfig (line 10) | class AdapterArchConfig(ArchConfig):
method __post_init__ (line 39) | def __post_init__(self) -> None:
class AdapterConfig (line 45) | class AdapterConfig(ModelConfig):
method add_cli_args (line 52) | def add_cli_args(parser: Any, prefix: str = "dit-config") -> Any:
FILE: python/sglang/multimodal_gen/configs/models/adapter/ltx_2_connector.py
class LTX2ConnectorArchConfig (line 10) | class LTX2ConnectorArchConfig(AdapterArchConfig):
class LTX2ConnectorConfig (line 29) | class LTX2ConnectorConfig(AdapterConfig):
FILE: python/sglang/multimodal_gen/configs/models/base.py
class ArchConfig (line 16) | class ArchConfig:
method __getattr__ (line 22) | def __getattr__(self, name: str):
method __setattr__ (line 31) | def __setattr__(self, key, value):
class ModelConfig (line 44) | class ModelConfig:
method __getattr__ (line 52) | def __getattr__(self, name):
method __getstate__ (line 60) | def __getstate__(self):
method __setstate__ (line 66) | def __setstate__(self, state):
method update_model_arch (line 71) | def update_model_arch(self, source_model_dict: dict[str, Any]) -> None:
method update_model_config (line 83) | def update_model_config(self, source_model_dict: dict[str, Any]) -> None:
FILE: python/sglang/multimodal_gen/configs/models/bridges/mova_dual_tower.py
function _is_conditioner_block (line 9) | def _is_conditioner_block(name: str, module) -> bool:
class MOVADualTowerArchConfig (line 15) | class MOVADualTowerArchConfig(DiTArchConfig):
method __post_init__ (line 34) | def __post_init__(self):
class MOVADualTowerConfig (line 41) | class MOVADualTowerConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/base.py
class DiTArchConfig (line 13) | class DiTArchConfig(ArchConfig):
method __post_init__ (line 47) | def __post_init__(self) -> None:
class DiTConfig (line 53) | class DiTConfig(ModelConfig):
method add_cli_args (line 61) | def add_cli_args(parser: Any, prefix: str = "dit-config") -> Any:
FILE: python/sglang/multimodal_gen/configs/models/dits/flux.py
class FluxArchConfig (line 11) | class FluxArchConfig(DiTArchConfig):
method __post_init__ (line 68) | def __post_init__(self):
class FluxConfig (line 76) | class FluxConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/glmimage.py
class GlmImageArchConfig (line 7) | class GlmImageArchConfig(DiTArchConfig):
method __post_init__ (line 28) | def __post_init__(self):
class GlmImageDitConfig (line 36) | class GlmImageDitConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/helios.py
function is_blocks (line 7) | def is_blocks(n: str, m) -> bool:
class HeliosArchConfig (line 12) | class HeliosArchConfig(DiTArchConfig):
method __post_init__ (line 69) | def __post_init__(self):
class HeliosConfig (line 77) | class HeliosConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/hunyuan3d.py
class Hunyuan3DDiTArchConfig (line 8) | class Hunyuan3DDiTArchConfig(DiTArchConfig):
method __post_init__ (line 33) | def __post_init__(self) -> None:
class Hunyuan3DDiTConfig (line 40) | class Hunyuan3DDiTConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/hunyuanvideo.py
function is_double_block (line 11) | def is_double_block(n: str, m) -> bool:
function is_single_block (line 15) | def is_single_block(n: str, m) -> bool:
function is_refiner_block (line 19) | def is_refiner_block(n: str, m) -> bool:
function is_txt_in (line 23) | def is_txt_in(n: str, m) -> bool:
class HunyuanVideoArchConfig (line 28) | class HunyuanVideoArchConfig(DiTArchConfig):
method __post_init__ (line 174) | def __post_init__(self):
class HunyuanVideoConfig (line 181) | class HunyuanVideoConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/ltx_2.py
class LTXModelType (line 8) | class LTXModelType(Enum):
method is_video_enabled (line 20) | def is_video_enabled(self) -> bool:
method is_audio_enabled (line 23) | def is_audio_enabled(self) -> bool:
class LTX2RopeType (line 27) | class LTX2RopeType(str, Enum):
class LTX2AttentionFunction (line 39) | class LTX2AttentionFunction(str, Enum):
function is_blocks (line 50) | def is_blocks(n: str, m) -> bool:
class LTX2ArchConfig (line 55) | class LTX2ArchConfig(DiTArchConfig):
method __post_init__ (line 154) | def __post_init__(self):
class LTX2Config (line 171) | class LTX2Config(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/mova_audio.py
function _is_blocks (line 9) | def _is_blocks(n: str, m) -> bool:
class MOVAAudioArchConfig (line 14) | class MOVAAudioArchConfig(DiTArchConfig):
method __post_init__ (line 54) | def __post_init__(self):
class MOVAAudioConfig (line 65) | class MOVAAudioConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/mova_video.py
function _is_blocks (line 9) | def _is_blocks(n: str, m) -> bool:
class MOVAVideoArchConfig (line 14) | class MOVAVideoArchConfig(DiTArchConfig):
method __post_init__ (line 53) | def __post_init__(self):
class MOVAVideoConfig (line 64) | class MOVAVideoConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/qwenimage.py
class QwenImageArchConfig (line 11) | class QwenImageArchConfig(DiTArchConfig):
method __post_init__ (line 38) | def __post_init__(self):
class QwenImageEditPlus_2511_ArchConfig (line 46) | class QwenImageEditPlus_2511_ArchConfig(QwenImageArchConfig):
class QwenImageDitConfig (line 51) | class QwenImageDitConfig(DiTConfig):
class QwenImageEditPlus_2511_DitConfig (line 58) | class QwenImageEditPlus_2511_DitConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/sana.py
class SanaArchConfig (line 21) | class SanaArchConfig(DiTArchConfig):
method __post_init__ (line 48) | def __post_init__(self):
class SanaConfig (line 55) | class SanaConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/wanvideo.py
function is_blocks (line 9) | def is_blocks(n: str, m) -> bool:
class WanVideoArchConfig (line 14) | class WanVideoArchConfig(DiTArchConfig):
method __post_init__ (line 94) | def __post_init__(self):
class WanVideoConfig (line 102) | class WanVideoConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/dits/zimage.py
function is_zimage_layer (line 10) | def is_zimage_layer(n: str, m) -> bool:
class ZImageArchConfig (line 22) | class ZImageArchConfig(DiTArchConfig):
method __post_init__ (line 67) | def __post_init__(self):
class ZImageDitConfig (line 75) | class ZImageDitConfig(DiTConfig):
FILE: python/sglang/multimodal_gen/configs/models/encoders/base.py
class EncoderArchConfig (line 15) | class EncoderArchConfig(ArchConfig):
class TextEncoderArchConfig (line 30) | class TextEncoderArchConfig(EncoderArchConfig):
method __post_init__ (line 49) | def __post_init__(self) -> None:
class ImageEncoderArchConfig (line 58) | class ImageEncoderArchConfig(EncoderArchConfig):
class BaseEncoderOutput (line 63) | class BaseEncoderOutput:
class EncoderConfig (line 72) | class EncoderConfig(ModelConfig):
class TextEncoderConfig (line 81) | class TextEncoderConfig(EncoderConfig):
class ImageEncoderConfig (line 91) | class ImageEncoderConfig(EncoderConfig):
FILE: python/sglang/multimodal_gen/configs/models/encoders/clip.py
function _is_transformer_layer (line 15) | def _is_transformer_layer(n: str, m) -> bool:
function _is_embeddings (line 19) | def _is_embeddings(n: str, m) -> bool:
class CLIPTextArchConfig (line 24) | class CLIPTextArchConfig(TextEncoderArchConfig):
class CLIPVisionArchConfig (line 61) | class CLIPVisionArchConfig(ImageEncoderArchConfig):
class CLIPTextConfig (line 87) | class CLIPTextConfig(TextEncoderConfig):
class CLIPVisionConfig (line 96) | class CLIPVisionConfig(ImageEncoderConfig):
FILE: python/sglang/multimodal_gen/configs/models/encoders/gemma2.py
function _is_transformer_layer (line 19) | def _is_transformer_layer(n: str, m) -> bool:
function _is_embeddings (line 23) | def _is_embeddings(n: str, m) -> bool:
function _is_final_norm (line 27) | def _is_final_norm(n: str, m) -> bool:
class Gemma2ArchConfig (line 32) | class Gemma2ArchConfig(TextEncoderArchConfig):
class Gemma2Config (line 85) | class Gemma2Config(TextEncoderConfig):
FILE: python/sglang/multimodal_gen/configs/models/encoders/gemma_3.py
function _is_transformer_layer (line 13) | def _is_transformer_layer(n: str, m) -> bool:
function _is_embeddings (line 17) | def _is_embeddings(n: str, m) -> bool:
function _is_final_norm (line 21) | def _is_final_norm(n: str, m) -> bool:
class Gemma3ArchConfig (line 26) | class Gemma3ArchConfig(TextEncoderArchConfig):
class Gemma3Config (line 78) | class Gemma3Config(TextEncoderConfig):
FILE: python/sglang/multimodal_gen/configs/models/encoders/llama.py
function _is_transformer_layer (line 12) | def _is_transformer_layer(n: str, m) -> bool:
function _is_embeddings (line 16) | def _is_embeddings(n: str, m) -> bool:
function _is_final_norm (line 20) | def _is_final_norm(n: str, m) -> bool:
class LlamaArchConfig (line 25) | class LlamaArchConfig(TextEncoderArchConfig):
class LlamaConfig (line 66) | class LlamaConfig(TextEncoderConfig):
FILE: python/sglang/multimodal_gen/configs/models/encoders/qwen3.py
function _is_transformer_layer (line 12) | def _is_transformer_layer(n: str, m) -> bool:
function _is_embeddings (line 16) | def _is_embeddings(n: str, m) -> bool:
function _is_final_norm (line 20) | def _is_final_norm(n: str, m) -> bool:
class Qwen3TextArchConfig (line 25) | class Qwen3TextArchConfig(TextEncoderArchConfig):
method __post_init__ (line 72) | def __post_init__(self) -> None:
class Qwen3TextConfig (line 82) | class Qwen3TextConfig(TextEncoderConfig):
FILE: python/sglang/multimodal_gen/configs/models/encoders/qwen_image.py
function _is_transformer_layer (line 12) | def _is_transformer_layer(n: str, m) -> bool:
function _is_embeddings (line 16) | def _is_embeddings(n: str, m) -> bool:
function _is_final_norm (line 20) | def _is_final_norm(n: str, m) -> bool:
class QwenImageArchConfig (line 25) | class QwenImageArchConfig(TextEncoderArchConfig):
class Qwen2_5VLConfig (line 66) | class Qwen2_5VLConfig(TextEncoderConfig):
FILE: python/sglang/multimodal_gen/configs/models/encoders/t5.py
Copy disabled (too large)
Download .json
Condensed preview — 4115 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (16,553K chars).
[
{
"path": ".claude/skills/add-jit-kernel/SKILL.md",
"chars": 21658,
"preview": "---\nname: add-jit-kernel\ndescription: Step-by-step tutorial for adding a new lightweight JIT CUDA kernel to sglang's jit"
},
{
"path": ".claude/skills/add-sgl-kernel/SKILL.md",
"chars": 11718,
"preview": "---\nname: add-sgl-kernel\ndescription: Step-by-step tutorial for adding a heavyweight AOT CUDA/C++ kernel to sgl-kernel ("
},
{
"path": ".claude/skills/sglang-bisect-ci-regression/SKILL.md",
"chars": 9684,
"preview": "# SGLang Bisect CI Regression\n\nInvestigate a consistently failing CI test to find the root cause - whether it's a code r"
},
{
"path": ".claude/skills/write-sglang-test/SKILL.md",
"chars": 8299,
"preview": "---\nname: write-sglang-test\ndescription: Guide for writing SGLang CI/UT tests following project conventions. Covers Cust"
},
{
"path": ".codespellrc",
"chars": 156,
"preview": "[codespell]\nignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS\n"
},
{
"path": ".coveragerc",
"chars": 250,
"preview": "[run]\nsource = python/sglang/srt\nomit =\n */test/*\n */__pycache__/*\n\n[report]\nshow_missing = true\nexclude_lines =\n "
},
{
"path": ".devcontainer/Dockerfile",
"chars": 1243,
"preview": "FROM lmsysorg/sglang:dev\n\n# Create non-root user with specified UID and GID\n# NOTE: Replace with your own UID and GID. T"
},
{
"path": ".devcontainer/devcontainer.json",
"chars": 949,
"preview": "{\n \"name\": \"sglang\",\n \"build\": {\n \"dockerfile\": \"Dockerfile\"\n },\n \"remoteUser\": \"devuser\",\n \"custo"
},
{
"path": ".github/CI_PERMISSIONS.json",
"chars": 36439,
"preview": "{\n \"1pikachu\": {\n \"can_tag_run_ci_label\": true,\n \"can_rerun_failed_ci\": true,\n \"cooldown_interva"
},
{
"path": ".github/CODEOWNERS",
"chars": 4509,
"preview": ".github @merrymercy @Fridge003 @ispobock @Kangyan-Zhou @bingxche\n/docker @Fridge003 @ispobock @HaiShaw @ishandhanani @yc"
},
{
"path": ".github/FOLDER_README.md",
"chars": 494,
"preview": "# Maintenance Tools\n\nThis folder contains tools and workflows for automating maintenance tasks.\n\n## CI Permissions\n\n`CI_"
},
{
"path": ".github/ISSUE_TEMPLATE/1-bug-report.yml",
"chars": 1252,
"preview": "name: 🐞 Bug report\ndescription: Report a bug to help us reproduce and fix it.\ntitle: \"[Bug] \"\nlabels: ['Bug']\n\nbody:\n- t"
},
{
"path": ".github/ISSUE_TEMPLATE/2-feature-request.yml",
"chars": 736,
"preview": "name: 🚀 Feature request\ndescription: Suggest an idea for this project\ntitle: \"[Feature] \"\n\nbody:\n- type: checkboxes\n at"
},
{
"path": ".github/MAINTAINER.md",
"chars": 5034,
"preview": "# SGLang Code Maintenance Model\nThis document describes the code maintenance model for the SGLang project.\nSince SGLang "
},
{
"path": ".github/actions/upload-cuda-coredumps/action.yml",
"chars": 914,
"preview": "name: Upload CUDA Coredumps\ndescription: Upload CUDA coredump files as artifacts and clean up the directory.\n\ninputs:\n "
},
{
"path": ".github/actions/wait-for-jobs/action.yml",
"chars": 4710,
"preview": "name: Wait for Jobs\ndescription: Poll and wait for specified jobs in the current workflow run to complete\n\ninputs:\n sta"
},
{
"path": ".github/labeler.yml",
"chars": 2524,
"preview": "# Configuration for the GitHub Labeler action\n# Automatically adds labels to PRs based on the files changed\n\n# Router sp"
},
{
"path": ".github/pull_request_template.md",
"chars": 2196,
"preview": "<!-- Thank you for your contribution! Please follow these guidelines to enhance your pull request. If anything is unclea"
},
{
"path": ".github/update_ci_permission.py",
"chars": 8045,
"preview": "\"\"\"\nUpdate the CI permissions configuration file.\n\nThis script updates the `CI_PERMISSIONS.json` file, which defines the"
},
{
"path": ".github/workflows/amd-aiter-scout.yml",
"chars": 6797,
"preview": "name: AMD AITER Scout\n\non:\n schedule:\n - cron: '0 20 * * 1' # Monday 20:00 UTC\n - cron: '0 20 * * 4' # Thursd"
},
{
"path": ".github/workflows/amd-ci-job-monitor.yml",
"chars": 4618,
"preview": "name: AMD CI Job Monitor\n\non:\n schedule:\n - cron: '0 0 * * *' # Daily at midnight UTC\n pull_request:\n paths:\n "
},
{
"path": ".github/workflows/auto-tune.yml",
"chars": 129,
"preview": "name: Auto tune\n\non:\n workflow_dispatch:\n\njobs:\n lint:\n runs-on: ubuntu-latest\n steps:\n - uses: actions/che"
},
{
"path": ".github/workflows/bot-bump-flashinfer-version.yml",
"chars": 1487,
"preview": "name: Bot Bump Flashinfer Version\n\non:\n workflow_dispatch:\n inputs:\n new_version:\n description: 'New fla"
},
{
"path": ".github/workflows/bot-bump-kernel-version-to-sglang.yml",
"chars": 3432,
"preview": "name: Bot Bump Kernel Version to SGLang\n\non:\n workflow_dispatch:\n\npermissions:\n contents: write\n pull-requests: write"
},
{
"path": ".github/workflows/bot-bump-kernel-version.yml",
"chars": 1466,
"preview": "name: Bot Bump Kernel Version\n\non:\n workflow_dispatch:\n inputs:\n new_version:\n description: 'New sgl-ker"
},
{
"path": ".github/workflows/bot-bump-sglang-version.yml",
"chars": 2631,
"preview": "name: Bot Bump SGLang Version\n\non:\n workflow_dispatch:\n inputs:\n new_version:\n description: 'New SGLang "
},
{
"path": ".github/workflows/bot-cherry-pick.yml",
"chars": 6872,
"preview": "name: Bot Cherry Pick to Release Branch\n\non:\n workflow_dispatch:\n inputs:\n commit_sha:\n description: 'Co"
},
{
"path": ".github/workflows/cancel-pr-workflow-on-merge.yml",
"chars": 491,
"preview": "name: Cancel PR Workflows on Merge\n\non:\n pull_request_target:\n types:\n - closed\n\npermissions:\n actions: write\n"
},
{
"path": ".github/workflows/cancel-unfinished-pr-tests.yml",
"chars": 5395,
"preview": "name: Cancel Unfinished PR Runs\n\non:\n workflow_dispatch:\n inputs:\n workflows:\n description: 'Space-separ"
},
{
"path": ".github/workflows/ci-coverage-overview.yml",
"chars": 4754,
"preview": "name: CI Coverage Overview\n\non:\n schedule:\n - cron: '0 6 * * *' # Daily at 6 AM UTC\n pull_request:\n paths:\n "
},
{
"path": ".github/workflows/ci-failure-monitor.yml",
"chars": 2113,
"preview": "name: CI Failure Monitor\n\non:\n schedule:\n - cron: '0 */12 * * *' # Every 12 hour\n workflow_dispatch:\n\nconcurrency:\n"
},
{
"path": ".github/workflows/close-inactive-issues.yml",
"chars": 3627,
"preview": "name: Close Inactive Issues\n\non:\n schedule:\n - cron: '0 0 * * *'\n workflow_dispatch:\n\npermissions:\n issues: write\n"
},
{
"path": ".github/workflows/diffusion-ci-gt-gen.yml",
"chars": 3509,
"preview": "name: Diffusion CI Ground Truth Generation\n\non:\n workflow_dispatch:\n inputs:\n ref:\n description: 'Git re"
},
{
"path": ".github/workflows/execute-notebook.yml",
"chars": 1964,
"preview": "name: Execute Notebooks\n\non:\n pull_request:\n branches: [ main ]\n types: [opened, synchronize, reopened, labeled]\n"
},
{
"path": ".github/workflows/labeler.yml",
"chars": 419,
"preview": "name: Auto Label PRs\n\non:\n pull_request_target:\n types: [opened, synchronize, reopened]\n\npermissions:\n contents: re"
},
{
"path": ".github/workflows/lint.yml",
"chars": 778,
"preview": "name: Lint\n\non:\n push:\n branches: [main]\n pull_request:\n branches: [main]\n\njobs:\n lint:\n runs-on: ubuntu-lat"
},
{
"path": ".github/workflows/list-active-pr-runs.yml.yml",
"chars": 12004,
"preview": "name: List Active Runs\n\non:\n workflow_dispatch:\n inputs:\n workflows:\n description: 'Space-separated list"
},
{
"path": ".github/workflows/nightly-release-gateway.yml",
"chars": 7255,
"preview": "# Nightly release workflow for SGLang Model Gateway\n\nname: Nightly Release SGLang Model Gateway to PyPI\n\non:\n schedule:"
},
{
"path": ".github/workflows/nightly-test-amd-rocm720.yml",
"chars": 68989,
"preview": "name: Nightly Test (AMD ROCm 7.2)\n\non:\n schedule:\n - cron: '30 17 * * *'\n push:\n branches:\n - main\n path"
},
{
"path": ".github/workflows/nightly-test-amd.yml",
"chars": 66198,
"preview": "name: Nightly Test (AMD)\n\non:\n schedule:\n - cron: '30 17 * * *'\n push:\n branches:\n - main\n paths:\n "
},
{
"path": ".github/workflows/nightly-test-intel.yml",
"chars": 767,
"preview": "name: Nightly Test (Intel)\n\non:\n schedule:\n - cron: '0 0 * * *'\n push:\n branches:\n - main\n paths:\n "
},
{
"path": ".github/workflows/nightly-test-npu.yml",
"chars": 16269,
"preview": "name: Nightly Test (NPU)\n\non:\n schedule:\n - cron: '0 17 * * *' # Execute at 1:00 a.m. Beijing Time every day\n pull"
},
{
"path": ".github/workflows/nightly-test-nvidia.yml",
"chars": 22591,
"preview": "name: Nightly Test (Nvidia)\n\non:\n schedule:\n - cron: '0 0 * * *'\n workflow_dispatch:\n inputs:\n job_filter:\n"
},
{
"path": ".github/workflows/open-pr-copy-from-oss.yml",
"chars": 595,
"preview": "name: Open A PR to Copy Code From OSS\n\non:\n workflow_dispatch:\n # schedule:\n # - cron: '0 10 * * *'\n\npermissions:\n "
},
{
"path": ".github/workflows/open-pr-copy-to-oss.yml",
"chars": 763,
"preview": "name: Open A PR to Copy Diff To OSS\n\non:\n workflow_dispatch:\n inputs:\n commit_sha:\n description: 'The co"
},
{
"path": ".github/workflows/patch-docker-dev.yml",
"chars": 4274,
"preview": "name: Patch Docker Image\n\non:\n workflow_dispatch:\n inputs:\n pr_numbers:\n description: \"Comma-separated P"
},
{
"path": ".github/workflows/pr-benchmark-rust.yml",
"chars": 6186,
"preview": "name: PR Benchmark (SMG Components)\n\non:\n push:\n branches: [ main ]\n paths:\n - \"sgl-model-gateway/**\"\n pull"
},
{
"path": ".github/workflows/pr-gate.yml",
"chars": 11451,
"preview": "on:\n workflow_call:\n inputs:\n require-run-ci:\n description: \"Whether the PR must have the run-ci label\"\n"
},
{
"path": ".github/workflows/pr-test-amd-rocm720.yml",
"chars": 41071,
"preview": "name: PR Test ROCm 7.2 (AMD)\n# Dynamic run-name for /rerun-stage commands to enable URL lookup\n# Format: \"[stage-name] s"
},
{
"path": ".github/workflows/pr-test-amd.yml",
"chars": 40641,
"preview": "name: PR Test (AMD)\n# Dynamic run-name for /rerun-stage commands to enable URL lookup\n# Format: \"[stage-name] sha\" for f"
},
{
"path": ".github/workflows/pr-test-npu.yml",
"chars": 17634,
"preview": "name: PR Test (NPU)\r\n\r\non:\r\n push:\r\n branches: [ main ]\r\n pull_request:\r\n branches: [ main ]\r\n workflow_dispatc"
},
{
"path": ".github/workflows/pr-test-rust.yml",
"chars": 11895,
"preview": "name: PR Test (SMG)\n\non:\n push:\n branches: [ main ]\n paths:\n - \"sgl-model-gateway/**\"\n pull_request:\n br"
},
{
"path": ".github/workflows/pr-test-xeon.yml",
"chars": 4368,
"preview": "name: PR Test (Xeon)\n\non:\n push:\n branches: [ main ]\n pull_request:\n branches: [ main ]\n workflow_dispatch:\n w"
},
{
"path": ".github/workflows/pr-test-xpu.yml",
"chars": 5161,
"preview": "name: PR Test (XPU)\n\non:\n push:\n branches: [ main ]\n pull_request:\n branches: [ main ]\n workflow_dispatch:\n wo"
},
{
"path": ".github/workflows/pr-test.yml",
"chars": 64049,
"preview": "name: PR Test\n# Dynamic run-name for /rerun-stage commands to enable URL lookup\n# Format: \"[stage-name] sha\" for fork PR"
},
{
"path": ".github/workflows/release-branch-cut.yml",
"chars": 7397,
"preview": "name: Release Branch Cut\n\non:\n workflow_dispatch:\n inputs:\n branch_name:\n description: 'Branch name to c"
},
{
"path": ".github/workflows/release-docker-amd-nightly.yml",
"chars": 6170,
"preview": "name: Release Docker Images Nightly (AMD)\non:\n workflow_dispatch:\n schedule:\n - cron: '0 12 * * *'\n\nconcurrency:\n "
},
{
"path": ".github/workflows/release-docker-amd-rocm720-nightly.yml",
"chars": 3605,
"preview": "name: Release Docker Images ROCm 7.2.0 Nightly Preview (AMD)\non:\n workflow_dispatch:\n schedule:\n - cron: '0 12 * * "
},
{
"path": ".github/workflows/release-docker-amd.yml",
"chars": 2974,
"preview": "name: Release Docker Images (AMD)\non:\n push:\n tags:\n - 'v[0-9]+.*'\n workflow_dispatch:\n inputs:\n versi"
},
{
"path": ".github/workflows/release-docker-cu13-framework.yml",
"chars": 6364,
"preview": "name: Release CUDA 13 Framework Docker Images (Temporary)\n\n# Temporary workflow to build only versioned cu13 framework i"
},
{
"path": ".github/workflows/release-docker-dev.yml",
"chars": 7147,
"preview": "name: Build and Push Development Docker Images\n\non:\n workflow_dispatch:\n inputs:\n pr_number:\n descriptio"
},
{
"path": ".github/workflows/release-docker-gateway.yml",
"chars": 1098,
"preview": "name: Release SGLang Model Gateway Docker Image\non:\n push:\n branches:\n - main\n paths:\n - sgl-model-gate"
},
{
"path": ".github/workflows/release-docker-npu-nightly.yml",
"chars": 2787,
"preview": "name: Release Docker Images Nightly (NPU)\non:\n pull_request:\n branches:\n - 'main'\n paths:\n - '.github/w"
},
{
"path": ".github/workflows/release-docker-npu.yml",
"chars": 3181,
"preview": "name: Release Docker Images (NPU)\non:\n push:\n tags:\n - 'v[0-9]+.*'\n workflow_dispatch:\n inputs:\n versi"
},
{
"path": ".github/workflows/release-docker-xeon.yml",
"chars": 1730,
"preview": "name: Release Docker Xeon Images\non:\n push:\n tags:\n - 'v[0-9]+.*'\n workflow_dispatch:\n inputs:\n versio"
},
{
"path": ".github/workflows/release-docker.yml",
"chars": 16437,
"preview": "name: Release Docker Images\n#\n# This workflow builds and publishes both framework and runtime Docker images:\n#\n# Framewo"
},
{
"path": ".github/workflows/release-docs.yml",
"chars": 3065,
"preview": "name: Release Documentation\n\non:\n release:\n types: [published]\n push:\n branches:\n - main\n paths:\n -"
},
{
"path": ".github/workflows/release-pypi-gateway.yml",
"chars": 5386,
"preview": "name: Release SGLang Model Gateway to PyPI\n\non:\n push:\n branches:\n - main\n paths:\n - sgl-model-gateway/"
},
{
"path": ".github/workflows/release-pypi-nightly.yml",
"chars": 5957,
"preview": "name: Release PyPI Nightly Wheels\n\non:\n # Run daily at 2 AM UTC\n schedule:\n - cron: '0 2 * * *'\n # Triggered by ni"
},
{
"path": ".github/workflows/release-pypi-pr.yml",
"chars": 6932,
"preview": "name: Release PyPI PR Wheels\n\non:\n workflow_dispatch:\n inputs:\n pr_number:\n description: 'PR number to b"
},
{
"path": ".github/workflows/release-pypi.yml",
"chars": 797,
"preview": "name: Release PyPI\non:\n push:\n tags:\n - 'v[0-9]+.*'\n workflow_dispatch:\n\njobs:\n publish:\n if: github.repos"
},
{
"path": ".github/workflows/release-tag.yml",
"chars": 2181,
"preview": "name: Release Tag\n# Creates a git tag to trigger release workflows (PyPI, Docker)\n# Use this after testing on a release "
},
{
"path": ".github/workflows/release-whl-kernel.yml",
"chars": 13759,
"preview": "name: Release SGLang Kernels\n\non:\n push:\n branches:\n - main\n paths:\n - sgl-kernel/python/sgl_kernel/ver"
},
{
"path": ".github/workflows/rerun-ut.yml",
"chars": 2415,
"preview": "name: Rerun UT\nrun-name: ${{ inputs.pr_head_sha && format('[rerun-ut] {0}', inputs.pr_head_sha) || '[rerun-ut]' }}\n\non:\n"
},
{
"path": ".github/workflows/retag-docker.yml",
"chars": 895,
"preview": "name: Retag Docker Image\n\non:\n workflow_dispatch:\n inputs:\n source_tag:\n description: \"Existing image ta"
},
{
"path": ".github/workflows/runner-utilization.yml",
"chars": 1152,
"preview": "name: Runner Utilization Report\n\non:\n schedule:\n - cron: '0 8 * * *' # Daily at 8 AM UTC\n pull_request:\n paths:"
},
{
"path": ".github/workflows/slash-command-handler.yml",
"chars": 3271,
"preview": "name: Slash Command Handler\n\non:\n issue_comment:\n types: [created, edited]\n\npermissions:\n contents: read\n pull-req"
},
{
"path": ".github/workflows/stress-test.yml",
"chars": 1076,
"preview": "name: Stress Test\n\non:\n workflow_dispatch:\n inputs:\n num_prompts:\n description: 'Number of prompts per m"
},
{
"path": ".github/workflows/weekly-test-nvidia.yml",
"chars": 1344,
"preview": "name: Weekly Test (Nvidia)\n\non:\n schedule:\n - cron: '0 0 * * 0' # Run every Sunday at midnight UTC\n workflow_dispa"
},
{
"path": ".gitignore",
"chars": 4368,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": ".isort.cfg",
"chars": 50,
"preview": "[settings]\nprofile=black\nknown_first_party=sglang\n"
},
{
"path": ".pre-commit-config.yaml",
"chars": 3313,
"preview": "default_stages: [pre-commit, pre-push, manual]\nexclude: ^(python/sglang/multimodal_gen/csrc|python/sglang/jit_kernel/fla"
},
{
"path": "3rdparty/amd/profiling/PROFILING.md",
"chars": 17197,
"preview": "## Profiling SGLang Infer System with AMD GPUs\nThis AppNote describes the SGLang profiling technical, code augment and r"
},
{
"path": "3rdparty/amd/profiling/client.sh",
"chars": 964,
"preview": "#!/bin/bash\n\n# Start profiling via API\ncurl http://localhost:30000/start_profile -H \"Content-Type: application/json\"\n\n# "
},
{
"path": "3rdparty/amd/profiling/install_rpd.sh",
"chars": 367,
"preview": "# download and install RPD\napt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev\n\n# install rpd module\ngit clon"
},
{
"path": "3rdparty/amd/profiling/loadTracer.sh",
"chars": 1749,
"preview": "#!/bin/bash\n################################################################################\n# Copyright (c) 2021 - 2023"
},
{
"path": "3rdparty/amd/profiling/rpd.patch",
"chars": 568,
"preview": "diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile\nindex e9d9feb..b2e9e1a 100644\n--- a/rpd_tracer/Makefile\n+++ b/rpd"
},
{
"path": "3rdparty/amd/profiling/rpd_profile_server_enable.patch",
"chars": 1849,
"preview": "diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py\nindex 62d1ff9..9021c01 10"
},
{
"path": "3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch",
"chars": 4559,
"preview": "diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py\nindex 62d1ff9..2edb427 10"
},
{
"path": "3rdparty/amd/profiling/server.sh",
"chars": 615,
"preview": "#!/bin/bash\n\n# export SGLANG_TORCH_PROFILER_DIR=/data/sglang/\nexport SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/pro"
},
{
"path": "3rdparty/amd/profiling/torch_profiler.patch",
"chars": 1182,
"preview": "diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py\r\nindex 62d1ff9..6ecd78c 1"
},
{
"path": "3rdparty/amd/tuning/TUNING.md",
"chars": 6436,
"preview": "## Tuning SGLang Infer System with AMD GPUs\nThis AppNote describes the SGLang performance tuning technical, code harness"
},
{
"path": "3rdparty/amd/tuning/benchmark_moe_rocm.py",
"chars": 12338,
"preview": "import argparse\nimport json\nimport os\nimport sys\n\nimport torch\nimport torch.nn.functional as F\nimport triton\nimport trit"
},
{
"path": "3rdparty/amd/wheel/README.md",
"chars": 3918,
"preview": "# sglang-kernel (prior sgl-kernel)\n\nBuilding and releasing `sglang-kernel` as a wheel is a part of the release workflow."
},
{
"path": "3rdparty/amd/wheel/sgl-kernel/CMakeLists_rocm.txt",
"chars": 4913,
"preview": "cmake_minimum_required(VERSION 3.24 FATAL_ERROR)\nproject(sgl_kernel LANGUAGES CXX)\n\n# Cmake\nset(CMAKE_CXX_STANDARD 17)\ns"
},
{
"path": "3rdparty/amd/wheel/sgl-kernel/build_rocm.sh",
"chars": 2917,
"preview": "#!/bin/bash\nset -euo pipefail\n\nROCM_VERSION=${1:-}\n\nif [[ \"${ROCM_VERSION}\" == \"700\" ]]; then\n IMAGE=\"lmsysorg/sglang:v"
},
{
"path": "3rdparty/amd/wheel/sgl-kernel/rename_wheels_rocm.sh",
"chars": 881,
"preview": "#!/usr/bin/env bash\nset -ex\n\nWHEEL_DIR=\"dist\"\n\nwheel_files=($WHEEL_DIR/*.whl)\nfor wheel in \"${wheel_files[@]}\"; do\n i"
},
{
"path": "3rdparty/amd/wheel/sgl-kernel/rocm_hipify.py",
"chars": 1079,
"preview": "from pathlib import Path\n\nimport torch\nfrom torch.utils.cpp_extension import CUDAExtension\n\nroot = Path(__file__).parent"
},
{
"path": "3rdparty/amd/wheel/sglang/pyproject.toml",
"chars": 5744,
"preview": "[build-system]\nrequires = [\"setuptools>=61.0\", \"setuptools-scm>=8.0\", \"wheel\"]\nbuild-backend = \"setuptools.build_meta\"\n\n"
},
{
"path": "LICENSE",
"chars": 11346,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 10497,
"preview": "<div align=\"center\" id=\"sglangtop\">\n<img src=\"https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png\""
},
{
"path": "benchmark/asr/README.md",
"chars": 6196,
"preview": "# ASR Benchmark\n\nThis benchmark evaluates the performance and accuracy (Word Error Rate - WER) of Automatic Speech Recog"
},
{
"path": "benchmark/asr/bench_sglang.py",
"chars": 12120,
"preview": "import argparse\nimport asyncio\nimport base64\nimport io\nimport json\nimport time\nfrom statistics import mean, median\n\nimpo"
},
{
"path": "benchmark/bench_attention_sink/bench_attention_sink_triton.py",
"chars": 7492,
"preview": "import argparse\n\nimport torch\nimport triton\n\nfrom sglang.srt.layers.attention.triton_ops.decode_attention import (\n d"
},
{
"path": "benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py",
"chars": 3997,
"preview": "# Benchmark with lots of common prefixes. Used to benchmark prefix caching performance.\n#\n# Launch a server:\n# python -m"
},
{
"path": "benchmark/bench_linear_attention/bench_gdn_decode.py",
"chars": 14779,
"preview": "\"\"\"\nBenchmark & Correctness: GDN Packed Decode vs Baseline Decode.\n\nCompares:\n - Baseline: split(mixed_qkv) → view → fu"
},
{
"path": "benchmark/bench_linear_attention/bench_gdn_prefill.py",
"chars": 19679,
"preview": "\"\"\"\nBenchmark & Correctness: Triton GDN vs FlashInfer GDN (prefill).\n\nCompares:\n - Triton: sglang's chunk_gated_del"
},
{
"path": "benchmark/bench_rope/benchmark_rope_index.py",
"chars": 13626,
"preview": "# This script benchmarks MRotaryEmbedding.get_rope_index_glm4v (GLM4V mrope index builder).\n# It generates synthetic mul"
},
{
"path": "benchmark/benchmark_batch/benchmark_batch.py",
"chars": 7241,
"preview": "import concurrent.futures\nimport os\nimport random\nimport time\nfrom concurrent.futures import ProcessPoolExecutor\nfrom st"
},
{
"path": "benchmark/benchmark_batch/benchmark_tokenizer.py",
"chars": 6925,
"preview": "import argparse\nimport random\nimport time\nfrom statistics import mean\n\nfrom transformers import AutoTokenizer\n\nfrom sgla"
},
{
"path": "benchmark/benchmark_vllm_060/README.md",
"chars": 5155,
"preview": "## How to reproduce the benchmark results for SGLang v0.3.0 compared to vLLM v0.6.0\n\nIn short, with multi step enabled, "
},
{
"path": "benchmark/blog_v0_2/405b_sglang.sh",
"chars": 2262,
"preview": "# Create dummy weights:\n# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under t"
},
{
"path": "benchmark/blog_v0_2/405b_trt.sh",
"chars": 2259,
"preview": "# Launch trtllm\n# https://github.com/sgl-project/tensorrt-demo\n\n# offline\npython3 ../../python/sglang/bench_serving.py -"
},
{
"path": "benchmark/blog_v0_2/405b_vllm.sh",
"chars": 2381,
"preview": "# Create dummy weights:\n# 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under t"
},
{
"path": "benchmark/blog_v0_2/README.md",
"chars": 11199,
"preview": "# How to reproduce the benchmark results of SGLang\n\n## Prerequisite\n\n### Install the latest SGLang\n\n```bash\ngit clone ht"
},
{
"path": "benchmark/blog_v0_2/config.md",
"chars": 2374,
"preview": "### used for TensorRT LLM\n\n```\n{\n \"architecture\": \"LlamaForCausalLM\",\n \"dtype\": \"float16\",\n \"logits_dtype\": \"fl"
},
{
"path": "benchmark/boolq/README.md",
"chars": 308,
"preview": "## Download data\n```\ngit clone https://hf-mirror.com/datasets/google/boolq\n```\n\n## Convert parquet to json\n```\nbash parq"
},
{
"path": "benchmark/boolq/bench_sglang.py",
"chars": 3478,
"preview": "import argparse\nimport json\nimport time\n\nimport numpy as np\n\nfrom sglang.api import set_default_backend\nfrom sglang.test"
},
{
"path": "benchmark/boolq/convert_parquet_to_json.py",
"chars": 661,
"preview": "import sys\n\nimport pyarrow.parquet as pq\n\n\ndef convert_parquet_to_json(input_file, output_file):\n # read parquet file"
},
{
"path": "benchmark/boolq/parquet_to_json.sh",
"chars": 687,
"preview": "#!/bin/bash\n\n#define input and output direction\ninput_dir=\"./boolq/data\"\noutput_dir=\"./boolq/data\"\n\n#define files needed"
},
{
"path": "benchmark/ceval/README.md",
"chars": 258,
"preview": "## Download data\n```\ngit lfs clone https://huggingface.co/datasets/ceval/ceval-exam\n```\n\n## Run benchmark\n\n### Benchmark"
},
{
"path": "benchmark/ceval/bench_sglang.py",
"chars": 3915,
"preview": "import argparse\nimport json\nimport os\nimport random\nimport re\nimport time\n\nimport numpy as np\nfrom datasets import load_"
},
{
"path": "benchmark/deepseek_v3/README.md",
"chars": 19349,
"preview": "# DeepSeek V3.1/V3/R1 Support\n\nThe SGLang and DeepSeek teams collaborated to get DeepSeek V3 FP8 running on NVIDIA and A"
},
{
"path": "benchmark/dspy/README.md",
"chars": 1048,
"preview": "## Install\n\n```\npip3 install dspy-ai\n```\n\nTurn off cache at https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037a"
},
{
"path": "benchmark/dspy/bench_dspy_intro.py",
"chars": 6472,
"preview": "\"\"\"\nAdapted from\nhttps://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/intro.ipynb#L9\n\"\"\"\n\ni"
},
{
"path": "benchmark/fla/benchmark_layernorm_gated.py",
"chars": 10033,
"preview": "from typing import Optional\n\nimport numpy as np\nimport torch\n\n# Import the function to benchmark\nfrom sglang.srt.layers."
},
{
"path": "benchmark/generative_agents/README.md",
"chars": 954,
"preview": "## Download the dataset\n\n```\nwget -O agent_calls.jsonl https://drive.google.com/uc?export=download&id=19qLpD45e9JGTKF2cU"
},
{
"path": "benchmark/generative_agents/agent_functions.py",
"chars": 11214,
"preview": "import sglang as sgl\n\n# here are the top five agent functions contributing ~70% LLM calls\n# reference: https://github.co"
},
{
"path": "benchmark/generative_agents/bench_other.py",
"chars": 2484,
"preview": "import argparse\nimport json\nimport time\n\nfrom agent_functions import (\n action_location_object_prompt,\n action_loc"
},
{
"path": "benchmark/generative_agents/bench_sglang.py",
"chars": 2152,
"preview": "import argparse\nimport json\nimport time\n\nfrom agent_functions import (\n action_location_object,\n action_location_s"
},
{
"path": "benchmark/gpt_oss/README.md",
"chars": 5404,
"preview": "# How to reproduce the result of GPT-OSS with SGLang\n\n### Install the latest SGLang\n\n```bash\ngit clone https://github.co"
},
{
"path": "benchmark/gsm8k/README.md",
"chars": 1475,
"preview": "## Run benchmark\n\n### Using GSM8K Platinum\n\nGSM8K Platinum is a revised version of the GSM8K test set with corrected lab"
},
{
"path": "benchmark/gsm8k/bench_other.py",
"chars": 5114,
"preview": "import argparse\nimport ast\nimport asyncio\nimport json\nimport re\nimport time\nfrom concurrent.futures import ThreadPoolExe"
},
{
"path": "benchmark/gsm8k/bench_sglang.py",
"chars": 6108,
"preview": "import argparse\nimport ast\nimport json\nimport os\nimport re\nimport time\n\nimport numpy as np\nfrom datasets import load_dat"
},
{
"path": "benchmark/hellaswag/README.md",
"chars": 1037,
"preview": "## Run benchmark\n\n### Benchmark sglang\n```\npython -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --p"
},
{
"path": "benchmark/hellaswag/bench_other.py",
"chars": 3686,
"preview": "import argparse\nimport asyncio\nimport json\nimport time\nfrom concurrent.futures import ThreadPoolExecutor\n\nimport numpy a"
},
{
"path": "benchmark/hellaswag/bench_sglang.py",
"chars": 3240,
"preview": "import argparse\nimport json\nimport os\nimport time\n\nimport numpy as np\n\nfrom sglang.lang.api import set_default_backend\nf"
},
{
"path": "benchmark/hf3fs/bench.sh",
"chars": 2691,
"preview": "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/dist-packages/"
},
{
"path": "benchmark/hf3fs/bench_client.py",
"chars": 4832,
"preview": "import concurrent.futures\nimport logging\nimport random\nimport time\nfrom typing import List\n\nimport torch\nfrom tqdm impor"
},
{
"path": "benchmark/hf3fs/bench_storage.py",
"chars": 7297,
"preview": "import json\nimport logging\nimport os\nimport random\nimport time\nfrom typing import List\n\nimport torch\nfrom tqdm import tq"
},
{
"path": "benchmark/hf3fs/bench_zerocopy.py",
"chars": 3585,
"preview": "import threading\nimport time\n\nimport torch\nfrom tqdm import tqdm\n\nfrom sglang.srt.distributed import (\n get_world_gro"
},
{
"path": "benchmark/hicache/README.md",
"chars": 3952,
"preview": "## Run synthetic multi-turn benchmark\n\n```\n# SGLang server with radix cache disabled\npython -m sglang.launch_server --mo"
},
{
"path": "benchmark/hicache/bench_long_context.py",
"chars": 3496,
"preview": "import json\nimport queue\nimport time\n\nimport requests\nfrom bench_multiturn import (\n ReadyQueue,\n WorkloadGenerato"
},
{
"path": "benchmark/hicache/bench_mix.py",
"chars": 19108,
"preview": "import argparse\nimport asyncio\nimport json\nimport logging\nimport os\nimport queue\nimport random\nimport threading\nimport t"
},
{
"path": "benchmark/hicache/bench_mix.sh",
"chars": 1642,
"preview": "#!/bin/bash\n\nexport LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages:/usr/local/lib/python3.12/d"
},
{
"path": "benchmark/hicache/bench_multiturn.py",
"chars": 29841,
"preview": "import argparse\nimport asyncio\nimport json\nimport queue\nimport random\nimport threading\nimport time\nfrom datetime import "
},
{
"path": "benchmark/hicache/bench_serving.py",
"chars": 37161,
"preview": "# Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_req"
},
{
"path": "benchmark/hicache/data_processing.py",
"chars": 20039,
"preview": "import json\nimport os\nimport pickle\nimport random\nfrom typing import List, Optional, Tuple, Union\n\nimport numpy as np\nfr"
},
{
"path": "benchmark/hicache/download.sh",
"chars": 1891,
"preview": "#!/usr/bin/bash\n\n# The usage function\nusage() {\n echo \"Usage: $0 {sharegpt|ultragpt|loogle|nextqa|all}\"\n exit 1\n}\n"
},
{
"path": "benchmark/hicache/nextqa.py",
"chars": 5152,
"preview": "import os\nimport sys\nfrom typing import List\n\nimport av\nfrom datasets import load_dataset\n\n\ndef find_video_files(video_d"
},
{
"path": "benchmark/hicache/perf.py",
"chars": 7454,
"preview": "from __future__ import annotations\n\nfrom typing import Any, Callable, NamedTuple\n\nimport torch\n\n\ndef jit_hicache_impl(\n "
},
{
"path": "benchmark/json_decode_regex/README.md",
"chars": 1019,
"preview": "## Run benchmark\n\n### Build dataset\n```\npip install wikipedia\npython3 build_dataset.py\n```\n\n### Dependencies\n\n```\nllama_"
},
{
"path": "benchmark/json_decode_regex/bench_other.py",
"chars": 2984,
"preview": "import argparse\nimport json\nimport time\nfrom concurrent.futures import ThreadPoolExecutor\nfrom functools import partial\n"
},
{
"path": "benchmark/json_decode_regex/bench_sglang.py",
"chars": 3528,
"preview": "import argparse\nimport json\nimport time\n\nimport sglang as sgl\nfrom sglang.lang.ir import REGEX_FLOAT, REGEX_INT, REGEX_S"
},
{
"path": "benchmark/json_decode_regex/build_dataset.py",
"chars": 1282,
"preview": "import json\n\nimport transformers\nimport wikipedia\n\nmodel_path = \"meta-llama/Llama-2-7b-chat-hf\"\nt = transformers.AutoTok"
},
{
"path": "benchmark/json_jump_forward/README.md",
"chars": 1709,
"preview": "## Run benchmark\n\n### Dependencies\n\n```\nllama_cpp_python 0.2.38\nguidance 0.1.10\nvllm "
},
{
"path": "benchmark/json_jump_forward/bench_other.py",
"chars": 9562,
"preview": "import argparse\nimport json\nimport time\nfrom concurrent.futures import ThreadPoolExecutor\nfrom functools import partial\n"
},
{
"path": "benchmark/json_jump_forward/bench_sglang.py",
"chars": 4421,
"preview": "import argparse\nimport json\nimport time\n\nimport sglang as sgl\nfrom sglang.test.test_utils import (\n add_common_sglang"
},
{
"path": "benchmark/json_jump_forward/build_dataset.py",
"chars": 1282,
"preview": "import json\n\nimport transformers\nimport wikipedia\n\nmodel_path = \"meta-llama/Llama-2-7b-chat-hf\"\nt = transformers.AutoTok"
},
{
"path": "benchmark/json_jump_forward/dataset.txt",
"chars": 750,
"preview": "Harry Potter\nHermione Granger\nRon Weasley\nAlbus Dumbledore\nSeverus Snape\nRubeus Hagrid\nDraco Malfoy\nGinny Weasley\nFred W"
},
{
"path": "benchmark/json_schema/README.md",
"chars": 205,
"preview": "## Run benchmark\n\n### Benchmark sglang\n\nRun Llama-8b\n\n```bash\npython3 -m sglang.launch_server --model-path meta-llama/Ll"
},
{
"path": "benchmark/json_schema/bench_sglang.py",
"chars": 4316,
"preview": "import argparse\nimport json\nimport time\nfrom typing import List, Tuple\n\nimport jsonschema\nfrom datasets import load_data"
},
{
"path": "benchmark/kernels/all_reduce/benchmark_aiter.py",
"chars": 9790,
"preview": "\"\"\"\nBenchmark SGLang vs Aiter custom all-reduce across message sizes.\nUsage:\n torchrun --nproc_per_node=2 benchmark_a"
},
{
"path": "benchmark/kernels/all_reduce/benchmark_all_reduce.py",
"chars": 10759,
"preview": "\"\"\"\nBenchmark SGLang custom all-reduce vs Torch symm-mem all-reduce across message sizes.\nUsage:\n torchrun --nproc_pe"
},
{
"path": "benchmark/kernels/all_reduce/benchmark_fused_ar_rms_amd.py",
"chars": 17612,
"preview": "\"\"\"\nBenchmark fused allreduce+rmsnorm on AMD with correctness checks.\n\nThis script targets the same fused op used by SGL"
},
{
"path": "benchmark/kernels/all_reduce/benchmark_mscclpp.py",
"chars": 7300,
"preview": "\"\"\"For Now, MSCCL is only supported on TP16 and TP8 case\n\nexport WORLD_SIZE=1\nexport RANK=0\nexport MASTER_ADDR=127.0.0.1"
},
{
"path": "benchmark/kernels/all_reduce/benchmark_torch_symm_mem.py",
"chars": 8005,
"preview": "\"\"\"For Now, TORCH_SYMM_MEM is only supported on following limited tp case\n\nSM90: {\n 2: 64 * MiB, # 64 MB\n 4: 64 *"
},
{
"path": "benchmark/kernels/decoding_attention_triton/triton_flashinfer_cudnn.py",
"chars": 12106,
"preview": "import itertools\nimport math\n\nimport cudnn\nimport torch\nimport torch.utils.benchmark as benchmark\nfrom flashinfer import"
},
{
"path": "benchmark/kernels/deepep/deepep_utils.py",
"chars": 7334,
"preview": "# ADAPTED FROM https://github.com/deepseek-ai/DeepEP/blob/main/tests/utils.py\n\nimport os\nimport sys\nfrom typing import O"
},
{
"path": "benchmark/kernels/deepep/tuning_deepep.py",
"chars": 19961,
"preview": "# MODIFIED FROM https://github.com/deepseek-ai/DeepEP/blob/main/tests/test_internode.py\n\n\"\"\"\nExample usage:\npython tunin"
},
{
"path": "benchmark/kernels/deepseek/README.md",
"chars": 942,
"preview": "## DeepSeek kernels benchmark\n\n\n### Prerequisites\n- You should install [DeepGemm](https://github.com/deepseek-ai/DeepGEM"
},
{
"path": "benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm.py",
"chars": 13146,
"preview": "from typing import Tuple\n\nimport deep_gemm\nimport tilelang\nimport tilelang.language as T\nimport torch\nimport triton\nfrom"
},
{
"path": "benchmark/kernels/deepseek/benchmark_deepgemm_fp8_gemm_blackwell.py",
"chars": 9895,
"preview": "import argparse\nfrom typing import Tuple\n\nimport torch\nimport triton\nfrom deep_gemm import ceil_div\nfrom flashinfer.gemm"
},
{
"path": "benchmark/kernels/deepseek/benchmark_deepgemm_fp8_group_gemm.py",
"chars": 16345,
"preview": "from typing import Tuple\n\nimport deep_gemm\nimport torch\nimport triton\nimport triton.language as tl\nfrom deep_gemm import"
},
{
"path": "benchmark/kernels/elementwise/benchmark_concat_mla.py",
"chars": 5883,
"preview": "import torch\nimport triton\nimport triton.language as tl\nfrom sgl_kernel import concat_mla_k as concat_mla_k_cuda\n\nfrom s"
},
{
"path": "benchmark/kernels/flashinfer_allreduce_fusion/README.md",
"chars": 5946,
"preview": "# FlashInfer Fused AllReduce + RMSNorm Benchmark\n\nThis benchmark script is modified from the [original implementation](h"
},
{
"path": "benchmark/kernels/flashinfer_allreduce_fusion/benchmark_fused_collective.py",
"chars": 46070,
"preview": "# Modified from https://github.com/vllm-project/vllm/blob/237e1fb887c7f5a579420fa0295097f24b006594/benchmarks/kernels/be"
},
{
"path": "benchmark/kernels/fused_moe_triton/README.md",
"chars": 7891,
"preview": "## Tuning Triton MoE Kernels\n\nThis directory contains benchmarking tools for MoE (Mixture of Experts) kernels.\n\n### Over"
},
{
"path": "benchmark/kernels/fused_moe_triton/benchmark_sglang_fused_moe_triton.py",
"chars": 6883,
"preview": "# python3 benchmark/kernels/fused_moe_triton/sglang_fused_moe_triton.py --model /DeepSeek-V3/ --tp-size 8\nimport argpars"
},
{
"path": "benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py",
"chars": 9333,
"preview": "# python3 benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py --model /DeepSeek-V3/ --tp-size 8 --us"
},
{
"path": "benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py",
"chars": 7518,
"preview": "# python3 benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py --model /DeepSeek-V3/ --tp-siz"
},
{
"path": "benchmark/kernels/fused_moe_triton/common_utils.py",
"chars": 9412,
"preview": "import json\nfrom typing import Dict, List, TypedDict\n\nimport torch\n\nfrom sglang.srt.layers.moe.fused_moe_triton.fused_mo"
},
{
"path": "benchmark/kernels/fused_moe_triton/tuning_client.py",
"chars": 2063,
"preview": "import argparse\nimport os\nimport time\n\nimport openai\n\n\"\"\"\n# Edit the code file srt/models/deepseek_v2.py in the Python s"
},
{
"path": "benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py",
"chars": 16695,
"preview": "# Adapted from https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py\nimport argparse\nimport"
},
{
"path": "benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton_sep.py",
"chars": 29115,
"preview": "# Adapted from https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py\nimport argparse\nimport"
},
{
"path": "benchmark/kernels/fused_moe_triton/tuning_text.json",
"chars": 247081,
"preview": "{\"prompt\": \"Here are the relevant Wikipedia articles:\\nThe president of the United States (POTUS) is the head of state a"
},
{
"path": "benchmark/kernels/quantization/README.md",
"chars": 3607,
"preview": "# W8A8 Block-wise Quantization Kernel Tuning\n\nAuto-tune Triton FP8/INT8 block-wise quantization kernels for optimal perf"
},
{
"path": "benchmark/kernels/quantization/bench_fp4_quant.py",
"chars": 3940,
"preview": "import argparse\nimport itertools\n\nimport torch\nimport triton\nfrom flashinfer import (\n scaled_fp4_grouped_quantize,\n "
},
{
"path": "benchmark/kernels/quantization/bench_int8_quant.py",
"chars": 2894,
"preview": "import argparse\n\nimport torch\nimport triton\nfrom vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant\n\nfr"
},
{
"path": "benchmark/kernels/quantization/tuning_block_wise_kernel.py",
"chars": 16082,
"preview": "# Copyright 2025 SGLang Team\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this fi"
},
{
"path": "benchmark/kernels/scheduler_batch/benchmark_get_last_loc_triton.py",
"chars": 4782,
"preview": "import os\n\nimport torch\nimport triton\nimport triton.language as tl\n\nfrom sglang.benchmark.bench_utils import run_bench\n\n"
},
{
"path": "benchmark/kernels/scheduler_batch/benchmark_write_req_to_token_pool_triton.py",
"chars": 10481,
"preview": "import itertools\nimport os\n\nimport torch\nimport triton\nimport triton.language as tl\n\nfrom sglang.benchmark.bench_utils i"
},
{
"path": "benchmark/kernels/sliding_window_attention_triton/bench_triton_swa_kernel.py",
"chars": 9340,
"preview": "import itertools\n\nimport torch\nimport torch.nn.functional as F\nimport triton.testing as tt\n\nfrom sglang.benchmark.bench_"
},
{
"path": "benchmark/line_retrieval/README.md",
"chars": 745,
"preview": "## Download data\n\n```\nwget https://raw.githubusercontent.com/merrymercy/merrymercy.github.io/master/files/random_words.j"
},
{
"path": "benchmark/line_retrieval/bench_sglang.py",
"chars": 4762,
"preview": "import argparse\nimport json\nimport re\nimport time\n\nimport numpy as np\n\nimport sglang as sgl\nfrom sglang.test.test_utils "
},
{
"path": "benchmark/line_retrieval/gen_data.py",
"chars": 4906,
"preview": "\"\"\"\nGenerate line data for line retrieval task.\n\nUsage:\npython3 gen_data.py --number 1000\n\"\"\"\n\nimport argparse\nimport js"
},
{
"path": "benchmark/llava_bench/README.md",
"chars": 1729,
"preview": "## Download benchmark images\n\n```\npython3 download_images.py\n```\n\nimage benchmark source: https://huggingface.co/dataset"
},
{
"path": "benchmark/llava_bench/bench_hf_llava_bench.sh",
"chars": 248,
"preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa \\\n --model-path liuhaotian/llava-v1.5-7b \\\n --question-file ./question"
},
{
"path": "benchmark/llava_bench/bench_hf_mme.sh",
"chars": 314,
"preview": "#!/bin/bash\n\npython -m llava.eval.model_vqa_loader \\\n --model-path liuhaotian/llava-v1.5-7b \\\n --question-file ./m"
},
{
"path": "benchmark/llava_bench/bench_sglang.py",
"chars": 3043,
"preview": "import argparse\nimport json\nimport os\nimport time\n\nimport tqdm\n\nimport sglang as sgl\nfrom sglang.test.test_utils import "
}
]
// ... and 3915 more files (download for full content)
About this extraction
This page contains the full source code of the sgl-project/sglang GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 4115 files (40.1 MB), approximately 4.2M tokens, and a symbol index with 12024 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.