gitextract_4p86pot8/

├── .clang-format
├── .claude/
│   └── skills/
│       ├── check-env/
│       │   └── SKILL.md
│       ├── code-navigation/
│       │   └── SKILL.md
│       ├── resolve-review/
│       │   └── SKILL.md
│       ├── submit-pr/
│       │   └── SKILL.md
│       └── support-new-model/
│           └── SKILL.md
├── .github/
│   ├── CONTRIBUTING.md
│   ├── ISSUE_TEMPLATE/
│   │   ├── 1-bug-report.yml
│   │   ├── 2-feature-request.yml
│   │   └── 3-documentation.yml
│   ├── pull_request_template.md
│   ├── release.yml
│   ├── scripts/
│   │   ├── action_tools.py
│   │   ├── check_lmdeploy.py
│   │   ├── doc_link_checker.py
│   │   ├── eval_base_config.py
│   │   ├── eval_chat_config.py
│   │   ├── eval_regression_base_models.py
│   │   ├── eval_regression_chat_models.py
│   │   ├── eval_stable_object_config.py
│   │   └── eval_stable_subject_config.py
│   └── workflows/
│       ├── api_eval.yml
│       ├── benchmark.yml
│       ├── cuda12.8_whl_release.yml
│       ├── daily_ete_test.yml
│       ├── daily_ete_test_3090.yml
│       ├── daily_ete_test_5080.yml
│       ├── docker.yml
│       ├── docker_dev.yml
│       ├── evaluate.yml
│       ├── lint.yml
│       ├── linux_x64_gpu.yml
│       ├── mllm_api_eval.yml
│       ├── pr_ete_test.yml
│       ├── pypi.yml
│       ├── stable.yml
│       ├── stale.yml
│       ├── test_docker.yml
│       ├── unit_test.yml
│       └── windows_x64_gpu.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pylintrc
├── CLAUDE.md
├── CMakeLists.txt
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_ja.md
├── README_zh-CN.md
├── autotest/
│   ├── benchmark/
│   │   ├── test_apiserver_performance.py
│   │   ├── test_longtext_performance.py
│   │   ├── test_mllm_apiserver_performance.py
│   │   ├── test_prefixcache_performance.py
│   │   └── test_throughput_performance.py
│   ├── chat_prompt_case.yml
│   ├── config.yml
│   ├── config_3090.yml
│   ├── config_3090_legacy.yml
│   ├── config_5080.yml
│   ├── config_5080_legacy.yml
│   ├── config_ascend.yml
│   ├── config_h.yml
│   ├── config_h800.yml
│   ├── config_h_legacy.yml
│   ├── config_legacy.yml
│   ├── config_test.yml
│   ├── config_testascend.yml
│   ├── conftest.py
│   ├── evaluate/
│   │   ├── eval_config_chat.py
│   │   ├── test_api_evaluate.py
│   │   └── test_mllm_api_evaluate.py
│   ├── interface/
│   │   ├── pipeline/
│   │   │   ├── test_pipeline_func.py
│   │   │   └── test_pipeline_longtext_func.py
│   │   └── restful/
│   │       ├── test_restful_chat_completions_v1.py
│   │       ├── test_restful_completions_v1.py
│   │       └── test_restful_generate.py
│   ├── prompt_case.yml
│   ├── pytest.ini
│   ├── template.json
│   ├── toolchain/
│   │   └── test_lagent.py
│   ├── tools/
│   │   ├── chat/
│   │   │   ├── test_command_chat_hf_pytorch.py
│   │   │   └── test_command_chat_hf_turbomind.py
│   │   ├── common_case_config.py
│   │   ├── pipeline/
│   │   │   ├── llm_case.py
│   │   │   ├── mllm_case.py
│   │   │   ├── test_pipeline_chat_pytorch_llm.py
│   │   │   ├── test_pipeline_chat_pytorch_mllm.py
│   │   │   ├── test_pipeline_chat_turbomind_llm.py
│   │   │   └── test_pipeline_chat_turbomind_mllm.py
│   │   ├── quantization/
│   │   │   ├── test_quantization_awq.py
│   │   │   └── test_quantization_w8a8.py
│   │   └── restful/
│   │       ├── test_restful_chat_hf_pytorch_llm.py
│   │       ├── test_restful_chat_hf_pytorch_mllm.py
│   │       ├── test_restful_chat_hf_turbomind_llm.py
│   │       └── test_restful_chat_hf_turbomind_mllm.py
│   └── utils/
│       ├── benchmark_utils.py
│       ├── common_utils.py
│       ├── config_utils.py
│       ├── constant.py
│       ├── evaluate_utils.py
│       ├── get_run_config.py
│       ├── mp_log_utils.py
│       ├── pipeline_chat.py
│       ├── proxy_distributed_utils.py
│       ├── quantization_utils.py
│       ├── ray_distributed_utils.py
│       ├── restful_return_check.py
│       ├── rule_condition_assert.py
│       ├── run_client_chat.py
│       ├── run_restful_chat.py
│       └── toolkit.py
├── benchmark/
│   ├── README.md
│   ├── benchmark_decode.py
│   ├── benchmark_pipeline.py
│   ├── benchmark_serving.py
│   ├── benchmark_throughput.py
│   ├── lmdeploy.yml
│   ├── profile_pipeline_api.py
│   ├── profile_restful_api.py
│   └── profile_throughput.py
├── builder/
│   ├── manywheel/
│   │   ├── Dockerfile_2014
│   │   ├── README.md
│   │   ├── build_all_lmdeploy_builders.sh
│   │   ├── build_all_wheel.sh
│   │   ├── build_lmdeploy_builder.sh
│   │   ├── build_wheel.sh
│   │   ├── entrypoint_build.sh
│   │   └── scripts/
│   │       ├── install_conda.sh
│   │       ├── install_cuda.sh
│   │       └── install_openmpi.sh
│   └── windows/
│       ├── README.md
│       ├── generate.ps1
│       └── setup_cuda.ps1
├── cmake/
│   ├── Modules/
│   │   └── FindNCCL.cmake
│   ├── TritonTurboMindBackendConfig.cmake.in
│   ├── TurboMindConfig.cmake.in
│   └── yaml-cpp_cmake_policy.patch
├── debug.sh
├── docker/
│   ├── Dockerfile
│   ├── Dockerfile.jetson
│   ├── Dockerfile_ascend_a2_300i
│   ├── Dockerfile_ascend_a3
│   ├── Dockerfile_dev
│   ├── InternVL_Dockerfile
│   ├── Qwen2VL_Dockerfile
│   ├── build.sh
│   ├── install.sh
│   └── prepare_wheel.sh
├── docs/
│   ├── en/
│   │   ├── .readthedocs.yaml
│   │   ├── Makefile
│   │   ├── _static/
│   │   │   └── css/
│   │   │       └── readthedocs.css
│   │   ├── advance/
│   │   │   ├── chat_template.md
│   │   │   ├── context_parallel.md
│   │   │   ├── debug_turbomind.md
│   │   │   ├── long_context.md
│   │   │   ├── metrics.md
│   │   │   ├── pytorch_multinodes.md
│   │   │   ├── pytorch_multithread.md
│   │   │   ├── pytorch_new_model.md
│   │   │   ├── pytorch_profiling.md
│   │   │   ├── spec_decoding.md
│   │   │   ├── structed_output.md
│   │   │   └── update_weights.md
│   │   ├── api/
│   │   │   ├── cli.rst
│   │   │   ├── openapi.rst
│   │   │   └── pipeline.rst
│   │   ├── benchmark/
│   │   │   ├── a100_fp16.md
│   │   │   ├── benchmark.md
│   │   │   ├── evaluate_with_opencompass.md
│   │   │   └── evaluate_with_vlmevalkit.md
│   │   ├── conf.py
│   │   ├── faq.md
│   │   ├── get_started/
│   │   │   ├── ascend/
│   │   │   │   └── get_started.md
│   │   │   ├── camb/
│   │   │   │   └── get_started.md
│   │   │   ├── get_started.md
│   │   │   ├── index.rst
│   │   │   ├── installation.md
│   │   │   └── maca/
│   │   │       └── get_started.md
│   │   ├── index.rst
│   │   ├── inference/
│   │   │   ├── load_hf.md
│   │   │   ├── pytorch.md
│   │   │   ├── turbomind.md
│   │   │   └── turbomind_config.md
│   │   ├── llm/
│   │   │   ├── api_server.md
│   │   │   ├── api_server_lora.md
│   │   │   ├── api_server_reasoning.md
│   │   │   ├── api_server_tools.md
│   │   │   ├── codellama.md
│   │   │   ├── pipeline.md
│   │   │   └── proxy_server.md
│   │   ├── make.bat
│   │   ├── multi_modal/
│   │   │   ├── api_server_vl.md
│   │   │   ├── cogvlm.md
│   │   │   ├── deepseek_vl2.md
│   │   │   ├── gemma3.md
│   │   │   ├── index.rst
│   │   │   ├── internvl.md
│   │   │   ├── llava.md
│   │   │   ├── minicpmv.md
│   │   │   ├── molmo.md
│   │   │   ├── phi3.md
│   │   │   ├── qwen2_5_vl.md
│   │   │   ├── qwen2_vl.md
│   │   │   ├── vl_pipeline.md
│   │   │   └── xcomposer2d5.md
│   │   ├── quantization/
│   │   │   ├── kv_quant.md
│   │   │   ├── llm_compressor.md
│   │   │   ├── w4a16.md
│   │   │   └── w8a8.md
│   │   └── supported_models/
│   │       ├── reward_models.md
│   │       └── supported_models.md
│   └── zh_cn/
│       ├── .readthedocs.yaml
│       ├── Makefile
│       ├── _static/
│       │   └── css/
│       │       └── readthedocs.css
│       ├── advance/
│       │   ├── chat_template.md
│       │   ├── context_parallel.md
│       │   ├── debug_turbomind.md
│       │   ├── long_context.md
│       │   ├── metrics.md
│       │   ├── pytorch_multinodes.md
│       │   ├── pytorch_multithread.md
│       │   ├── pytorch_new_model.md
│       │   ├── pytorch_profiling.md
│       │   ├── spec_decoding.md
│       │   ├── structed_output.md
│       │   └── update_weights.md
│       ├── api/
│       │   ├── cli.rst
│       │   ├── openapi.rst
│       │   └── pipeline.rst
│       ├── benchmark/
│       │   ├── benchmark.md
│       │   ├── evaluate_with_opencompass.md
│       │   └── evaluate_with_vlmevalkit.md
│       ├── conf.py
│       ├── faq.md
│       ├── get_started/
│       │   ├── ascend/
│       │   │   └── get_started.md
│       │   ├── camb/
│       │   │   └── get_started.md
│       │   ├── get_started.md
│       │   ├── index.rst
│       │   ├── installation.md
│       │   └── maca/
│       │       └── get_started.md
│       ├── index.rst
│       ├── inference/
│       │   ├── load_hf.md
│       │   ├── pytorch.md
│       │   ├── turbomind.md
│       │   └── turbomind_config.md
│       ├── llm/
│       │   ├── api_server.md
│       │   ├── api_server_lora.md
│       │   ├── api_server_reasoning.md
│       │   ├── api_server_tools.md
│       │   ├── codellama.md
│       │   ├── pipeline.md
│       │   └── proxy_server.md
│       ├── make.bat
│       ├── multi_modal/
│       │   ├── api_server_vl.md
│       │   ├── cogvlm.md
│       │   ├── deepseek_vl2.md
│       │   ├── gemma3.md
│       │   ├── index.rst
│       │   ├── internvl.md
│       │   ├── llava.md
│       │   ├── minicpmv.md
│       │   ├── molmo.md
│       │   ├── phi3.md
│       │   ├── qwen2_5_vl.md
│       │   ├── qwen2_vl.md
│       │   ├── vl_pipeline.md
│       │   └── xcomposer2d5.md
│       ├── quantization/
│       │   ├── kv_quant.md
│       │   ├── llm_compressor.md
│       │   ├── w4a16.md
│       │   └── w8a8.md
│       └── supported_models/
│           ├── reward_models.md
│           └── supported_models.md
├── eval/
│   ├── config.py
│   └── eval.py
├── examples/
│   └── lite/
│       ├── qwen3_30b_a3b_awq.py
│       └── qwen3_30b_a3b_gptq.py
├── generate.sh
├── k8s/
│   ├── deployment.yaml
│   └── service.yaml
├── lmdeploy/
│   ├── __init__.py
│   ├── __main__.py
│   ├── api.py
│   ├── archs.py
│   ├── cli/
│   │   ├── __init__.py
│   │   ├── chat.py
│   │   ├── cli.py
│   │   ├── entrypoint.py
│   │   ├── lite.py
│   │   ├── serve.py
│   │   └── utils.py
│   ├── lite/
│   │   ├── __init__.py
│   │   ├── apis/
│   │   │   ├── __init__.py
│   │   │   ├── auto_awq.py
│   │   │   ├── calibrate.py
│   │   │   ├── get_small_sharded_hf.py
│   │   │   ├── gptq.py
│   │   │   └── smooth_quant.py
│   │   ├── defaults.py
│   │   ├── modeling/
│   │   │   ├── __init__.py
│   │   │   ├── internlm2_gptq.py
│   │   │   └── internlm3_gptq.py
│   │   ├── quantization/
│   │   │   ├── __init__.py
│   │   │   ├── activation/
│   │   │   │   ├── __init__.py
│   │   │   │   └── observer.py
│   │   │   ├── awq.py
│   │   │   ├── calibration.py
│   │   │   ├── modules/
│   │   │   │   ├── __init__.py
│   │   │   │   └── linear.py
│   │   │   └── weight/
│   │   │       ├── __init__.py
│   │   │       ├── quant_utils.py
│   │   │       └── quantizer.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── batch_split.py
│   │       ├── cal_qparams.py
│   │       ├── calib_dataloader.py
│   │       ├── collect.py
│   │       ├── global_avail.py
│   │       ├── load.py
│   │       └── memory_efficient.py
│   ├── logger.py
│   ├── messages.py
│   ├── metrics/
│   │   ├── __init__.py
│   │   ├── loggers.py
│   │   ├── metrics_processor.py
│   │   └── stats.py
│   ├── model.py
│   ├── monitoring/
│   │   ├── docker-compose.yaml
│   │   ├── grafana/
│   │   │   ├── dashboards/
│   │   │   │   ├── config/
│   │   │   │   │   └── dashboard.yaml
│   │   │   │   └── json/
│   │   │   │       └── lmdeploy-dashboard.json
│   │   │   └── datasources/
│   │   │       └── datasource.yaml
│   │   └── prometheus.yaml
│   ├── pipeline.py
│   ├── profiler.py
│   ├── pytorch/
│   │   ├── __init__.py
│   │   ├── adapter/
│   │   │   ├── __init__.py
│   │   │   └── adapter.py
│   │   ├── backends/
│   │   │   ├── __init__.py
│   │   │   ├── activation.py
│   │   │   ├── apply_rotary_emb.py
│   │   │   ├── attention.py
│   │   │   ├── awq_modules.py
│   │   │   ├── base.py
│   │   │   ├── blockedf8_modules.py
│   │   │   ├── causal_conv1d.py
│   │   │   ├── cuda/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activation.py
│   │   │   │   ├── apply_rotary_emb.py
│   │   │   │   ├── attention/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── default.py
│   │   │   │   │   ├── fa3.py
│   │   │   │   │   └── mla.py
│   │   │   │   ├── awq_modules.py
│   │   │   │   ├── blockedf8_modules.py
│   │   │   │   ├── causal_conv1d.py
│   │   │   │   ├── flash_attention.py
│   │   │   │   ├── gated_delta_rule.py
│   │   │   │   ├── graph_runner.py
│   │   │   │   ├── lora.py
│   │   │   │   ├── moe/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── blocked_fp8.py
│   │   │   │   │   ├── default.py
│   │   │   │   │   ├── ep_utils.py
│   │   │   │   │   └── w8a8.py
│   │   │   │   ├── moe_router.py
│   │   │   │   ├── multinomial_sampling.py
│   │   │   │   ├── norm.py
│   │   │   │   ├── nsa.py
│   │   │   │   ├── op_backend.py
│   │   │   │   ├── qmodules.py
│   │   │   │   ├── token_dispatcher.py
│   │   │   │   ├── utils.py
│   │   │   │   └── warmup_manager.py
│   │   │   ├── deepep_moe_checker.py
│   │   │   ├── default/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activation.py
│   │   │   │   ├── apply_rotary_emb.py
│   │   │   │   ├── awq_modules.py
│   │   │   │   ├── embedding.py
│   │   │   │   ├── linear.py
│   │   │   │   ├── moe.py
│   │   │   │   ├── moe_router.py
│   │   │   │   ├── multinomial_sampling.py
│   │   │   │   ├── norm.py
│   │   │   │   ├── op_backend.py
│   │   │   │   ├── rotary_embedding.py
│   │   │   │   └── token_dispatcher.py
│   │   │   ├── dlinfer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activation.py
│   │   │   │   ├── apply_rotary_emb.py
│   │   │   │   ├── ascend/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── op_backend.py
│   │   │   │   │   └── utils.py
│   │   │   │   ├── attention.py
│   │   │   │   ├── awq_modules.py
│   │   │   │   ├── camb/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── op_backend.py
│   │   │   │   ├── flash_attention.py
│   │   │   │   ├── linear.py
│   │   │   │   ├── maca/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── op_backend.py
│   │   │   │   ├── moe.py
│   │   │   │   ├── norm.py
│   │   │   │   ├── op_backend.py
│   │   │   │   ├── qmodules.py
│   │   │   │   └── rotary_embedding.py
│   │   │   ├── embedding.py
│   │   │   ├── flash_attention.py
│   │   │   ├── gated_delta_rule.py
│   │   │   ├── graph_runner.py
│   │   │   ├── linear.py
│   │   │   ├── lora.py
│   │   │   ├── moe.py
│   │   │   ├── moe_router.py
│   │   │   ├── multinomial_sampling.py
│   │   │   ├── norm.py
│   │   │   ├── nsa.py
│   │   │   ├── qmodules.py
│   │   │   ├── rotary_embedding.py
│   │   │   ├── selector.py
│   │   │   └── token_dispatcher.py
│   │   ├── block.py
│   │   ├── check_env/
│   │   │   ├── __init__.py
│   │   │   ├── adapter.py
│   │   │   ├── base.py
│   │   │   ├── cuda.py
│   │   │   ├── deeplink.py
│   │   │   ├── dist.py
│   │   │   ├── model.py
│   │   │   ├── torch.py
│   │   │   ├── transformers.py
│   │   │   ├── triton.py
│   │   │   └── triton_custom_add.py
│   │   ├── config.py
│   │   ├── configurations/
│   │   │   ├── __init__.py
│   │   │   ├── builder.py
│   │   │   ├── chatglm.py
│   │   │   ├── cogvlm.py
│   │   │   ├── deepseek_v2.py
│   │   │   ├── deepseek_v32.py
│   │   │   ├── deepseek_vl2.py
│   │   │   ├── default.py
│   │   │   ├── gemma.py
│   │   │   ├── glm4.py
│   │   │   ├── gpt_oss.py
│   │   │   ├── interns1_pro.py
│   │   │   ├── internvl.py
│   │   │   ├── internvl3_hf.py
│   │   │   ├── llama.py
│   │   │   ├── llama4.py
│   │   │   ├── llava_hf.py
│   │   │   ├── minicpm3.py
│   │   │   ├── qwen.py
│   │   │   ├── qwen3_5.py
│   │   │   ├── qwen3_next.py
│   │   │   ├── qwen3_vl.py
│   │   │   ├── sdar.py
│   │   │   └── utils.py
│   │   ├── consts.py
│   │   ├── devices/
│   │   │   ├── __init__.py
│   │   │   └── device_manager.py
│   │   ├── disagg/
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── backend/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── backend.py
│   │   │   │   ├── base.py
│   │   │   │   ├── dlslime.py
│   │   │   │   └── mooncake.py
│   │   │   ├── config.py
│   │   │   ├── conn/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── engine_conn.py
│   │   │   │   ├── protocol.py
│   │   │   │   └── proxy_conn.py
│   │   │   └── messages.py
│   │   ├── distributed.py
│   │   ├── engine/
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── cache_engine.py
│   │   │   ├── config_builder.py
│   │   │   ├── engine.py
│   │   │   ├── engine_checker.py
│   │   │   ├── engine_instance.py
│   │   │   ├── engine_loop.py
│   │   │   ├── executor/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── base_worker.py
│   │   │   │   ├── dist_utils.py
│   │   │   │   ├── mp_executor.py
│   │   │   │   ├── ray_executor.py
│   │   │   │   └── uni_executor.py
│   │   │   ├── guided_process.py
│   │   │   ├── input_process.py
│   │   │   ├── inputs_maker.py
│   │   │   ├── logits_process.py
│   │   │   ├── model_agent/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── inputs_maker.py
│   │   │   │   └── profiler.py
│   │   │   ├── mp_engine/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── base_worker.py
│   │   │   │   ├── ray_engine.py
│   │   │   │   ├── zmq_engine.py
│   │   │   │   └── zmq_rpc.py
│   │   │   └── request.py
│   │   ├── envs.py
│   │   ├── kernels/
│   │   │   ├── __init__.py
│   │   │   ├── cuda/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activation.py
│   │   │   │   ├── apply_rotary_pos_emb.py
│   │   │   │   ├── awq_kernels.py
│   │   │   │   ├── bitonic_topk.py
│   │   │   │   ├── blocked_fp8_fused_moe.py
│   │   │   │   ├── blocked_gemm_fp8.py
│   │   │   │   ├── causal_conv1d.py
│   │   │   │   ├── ds_index.py
│   │   │   │   ├── fill_kv_cache.py
│   │   │   │   ├── flashattention.py
│   │   │   │   ├── flatten_kv_cache.py
│   │   │   │   ├── fused_lora.py
│   │   │   │   ├── fused_moe.py
│   │   │   │   ├── fused_moe_ep.py
│   │   │   │   ├── fused_noaux_tc.py
│   │   │   │   ├── gated_delta_rule.py
│   │   │   │   ├── multinomial_sampling.py
│   │   │   │   ├── pagedattention.py
│   │   │   │   ├── rms_norm.py
│   │   │   │   ├── utils.py
│   │   │   │   ├── w8a8_fused_moe.py
│   │   │   │   └── w8a8_triton_kernels.py
│   │   │   ├── default/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── multinomial_sampling.py
│   │   │   │   └── w8a8_kernels.py
│   │   │   ├── dispatcher.py
│   │   │   ├── dlinfer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── activation.py
│   │   │   │   ├── apply_rotary_pos_emb.py
│   │   │   │   ├── awq_kernels.py
│   │   │   │   ├── fill_kv_cache.py
│   │   │   │   ├── flash_attention.py
│   │   │   │   ├── fused_moe.py
│   │   │   │   ├── fused_rotary_emb.py
│   │   │   │   ├── linear.py
│   │   │   │   ├── moe_gating_topk_softmax.py
│   │   │   │   ├── pagedattention.py
│   │   │   │   ├── rms_norm.py
│   │   │   │   └── w8a8_kernels.py
│   │   │   └── w8a8_triton_kernels.py
│   │   ├── messages.py
│   │   ├── model_inputs.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── baichuan.py
│   │   │   ├── chatglm2.py
│   │   │   ├── cogvlm.py
│   │   │   ├── deepseek.py
│   │   │   ├── deepseek_mtp.py
│   │   │   ├── deepseek_v2.py
│   │   │   ├── deepseek_v32.py
│   │   │   ├── deepseek_vl2.py
│   │   │   ├── gemma.py
│   │   │   ├── gemma3_vl.py
│   │   │   ├── glm4.py
│   │   │   ├── glm4_1v.py
│   │   │   ├── glm4_moe.py
│   │   │   ├── glm4moe_mtp.py
│   │   │   ├── gpt_oss.py
│   │   │   ├── internlm.py
│   │   │   ├── internlm2.py
│   │   │   ├── internlm2_reward.py
│   │   │   ├── internlm2_ve.py
│   │   │   ├── internlm3.py
│   │   │   ├── interns1_pro.py
│   │   │   ├── interns1_pro_ts.py
│   │   │   ├── internvl.py
│   │   │   ├── internvl3_hf.py
│   │   │   ├── internvl_patch.py
│   │   │   ├── llama.py
│   │   │   ├── llama4.py
│   │   │   ├── llama_eagle.py
│   │   │   ├── llama_eagle3.py
│   │   │   ├── llava.py
│   │   │   ├── minicpm3.py
│   │   │   ├── minicpmv26.py
│   │   │   ├── mistral.py
│   │   │   ├── mixtral.py
│   │   │   ├── module_map.py
│   │   │   ├── patch.py
│   │   │   ├── phi3.py
│   │   │   ├── phi3_moe.py
│   │   │   ├── phi3_v.py
│   │   │   ├── q_modules.py
│   │   │   ├── qwen.py
│   │   │   ├── qwen2.py
│   │   │   ├── qwen2_5_vl.py
│   │   │   ├── qwen2_moe.py
│   │   │   ├── qwen2_reward.py
│   │   │   ├── qwen2_vl.py
│   │   │   ├── qwen3.py
│   │   │   ├── qwen3_5.py
│   │   │   ├── qwen3_5_moe.py
│   │   │   ├── qwen3_moe.py
│   │   │   ├── qwen3_next.py
│   │   │   ├── qwen3_vl.py
│   │   │   ├── qwen3_vl_moe.py
│   │   │   ├── sdar.py
│   │   │   ├── sdar_moe.py
│   │   │   ├── siglip.py
│   │   │   ├── starcoder2.py
│   │   │   ├── utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── cudagraph.py
│   │   │   │   ├── micro_batch.py
│   │   │   │   └── model.py
│   │   │   └── whisper.py
│   │   ├── multimodal/
│   │   │   ├── __init__.py
│   │   │   └── data_type.py
│   │   ├── nn/
│   │   │   ├── __init__.py
│   │   │   ├── activation.py
│   │   │   ├── attention.py
│   │   │   ├── embedding.py
│   │   │   ├── eplb.py
│   │   │   ├── gated_delta.py
│   │   │   ├── linear/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── awq.py
│   │   │   │   ├── base.py
│   │   │   │   ├── blocked_fp8.py
│   │   │   │   ├── default.py
│   │   │   │   ├── lora.py
│   │   │   │   ├── utils.py
│   │   │   │   └── w8a8.py
│   │   │   ├── moe/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── blocked_fp8.py
│   │   │   │   ├── default.py
│   │   │   │   ├── route.py
│   │   │   │   └── w8a8.py
│   │   │   ├── multinomial_sampling.py
│   │   │   ├── norm.py
│   │   │   ├── nsa.py
│   │   │   ├── quant_utils.py
│   │   │   ├── rotary_embedding.py
│   │   │   └── utils.py
│   │   ├── paging/
│   │   │   ├── __init__.py
│   │   │   ├── block_manager/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_block_manager.py
│   │   │   │   ├── default_block_manager.py
│   │   │   │   └── window_block_manager.py
│   │   │   ├── block_trie.py
│   │   │   ├── eviction_helper/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base_eviction_helper.py
│   │   │   │   └── recompute_eviction_helper.py
│   │   │   ├── scheduler.py
│   │   │   ├── seq_states/
│   │   │   │   ├── __init__.py
│   │   │   │   └── states.py
│   │   │   └── state_manager.py
│   │   ├── ray.py
│   │   ├── spec_decode/
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── proposers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── deepseek_mtp.py
│   │   │   │   ├── eagle.py
│   │   │   │   └── eagle3.py
│   │   │   ├── reject_sampler.py
│   │   │   └── spec_agent.py
│   │   ├── strategies/
│   │   │   ├── __init__.py
│   │   │   ├── ar/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── cudagraph.py
│   │   │   │   ├── engine.py
│   │   │   │   ├── model_agent.py
│   │   │   │   ├── model_inputs.py
│   │   │   │   ├── sampling.py
│   │   │   │   └── sequence.py
│   │   │   ├── ar_spec/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── cudagraph.py
│   │   │   │   ├── engine.py
│   │   │   │   ├── model_agent.py
│   │   │   │   ├── model_inputs.py
│   │   │   │   ├── sampling.py
│   │   │   │   └── sequence.py
│   │   │   ├── base/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── cudagraph.py
│   │   │   │   ├── engine.py
│   │   │   │   ├── model_agent.py
│   │   │   │   ├── model_inputs.py
│   │   │   │   ├── sampling.py
│   │   │   │   └── sequence.py
│   │   │   └── dllm/
│   │   │       ├── __init__.py
│   │   │       ├── cudagraph.py
│   │   │       ├── engine.py
│   │   │       ├── model_agent.py
│   │   │       ├── model_inputs.py
│   │   │       ├── sampling.py
│   │   │       ├── sequence.py
│   │   │       └── unmasking.py
│   │   ├── third_party/
│   │   │   ├── __init__.py
│   │   │   ├── deep_gemm/
│   │   │   │   └── __init__.py
│   │   │   └── flash_attn_interface.py
│   │   ├── tools/
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── transformers/
│   │   │   ├── __init__.py
│   │   │   └── configuration_deepseek_v32.py
│   │   ├── utils.py
│   │   └── weight_loader/
│   │       ├── __init__.py
│   │       └── model_weight_loader.py
│   ├── serve/
│   │   ├── __init__.py
│   │   ├── core/
│   │   │   ├── __init__.py
│   │   │   ├── async_engine.py
│   │   │   ├── exceptions.py
│   │   │   └── vl_async_engine.py
│   │   ├── managers/
│   │   │   ├── __init__.py
│   │   │   └── session_manager.py
│   │   ├── openai/
│   │   │   ├── __init__.py
│   │   │   ├── api_client.py
│   │   │   ├── api_server.py
│   │   │   ├── harmony_utils.py
│   │   │   ├── launch_server.py
│   │   │   ├── protocol.py
│   │   │   ├── reasoning_parser/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── deepseek_r1_reasoning_parser.py
│   │   │   │   ├── qwen_qwq_reasoning_parser.py
│   │   │   │   └── reasoning_parser.py
│   │   │   ├── serving_chat_completion.py
│   │   │   ├── serving_completion.py
│   │   │   ├── serving_generate.py
│   │   │   └── tool_parser/
│   │   │       ├── __init__.py
│   │   │       ├── internlm2_parser.py
│   │   │       ├── llama3_parser.py
│   │   │       ├── qwen2d5_parser.py
│   │   │       ├── qwen3_parser.py
│   │   │       ├── qwen3coder_parser.py
│   │   │       ├── tool_parser.py
│   │   │       └── utils.py
│   │   ├── processors/
│   │   │   ├── __init__.py
│   │   │   └── multimodal.py
│   │   ├── proxy/
│   │   │   ├── __init__.py
│   │   │   ├── proxy.py
│   │   │   ├── streaming_response.py
│   │   │   └── utils.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       └── server_utils.py
│   ├── tokenizer.py
│   ├── turbomind/
│   │   ├── __init__.py
│   │   ├── deploy/
│   │   │   ├── __init__.py
│   │   │   ├── config.py
│   │   │   ├── converter.py
│   │   │   ├── loader.py
│   │   │   ├── module.py
│   │   │   ├── parameter.py
│   │   │   ├── policy.py
│   │   │   ├── source_model/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── baichuan.py
│   │   │   │   ├── base.py
│   │   │   │   ├── deepseek2.py
│   │   │   │   ├── deepseek_vl.py
│   │   │   │   ├── glm4.py
│   │   │   │   ├── glm4_moe_lite.py
│   │   │   │   ├── gpt_oss.py
│   │   │   │   ├── internlm2.py
│   │   │   │   ├── internvl.py
│   │   │   │   ├── llama.py
│   │   │   │   ├── llava.py
│   │   │   │   ├── minicpmv.py
│   │   │   │   ├── mixtral.py
│   │   │   │   ├── molmo.py
│   │   │   │   ├── qwen.py
│   │   │   │   └── xcomposer2.py
│   │   │   └── target_model/
│   │   │       ├── __init__.py
│   │   │       ├── base.py
│   │   │       └── fp.py
│   │   ├── supported_models.py
│   │   ├── tokenizer_info.py
│   │   └── turbomind.py
│   ├── utils.py
│   ├── version.py
│   └── vl/
│       ├── __init__.py
│       ├── constants.py
│       ├── engine.py
│       ├── media/
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── connection.py
│       │   ├── image.py
│       │   ├── time_series.py
│       │   ├── video.py
│       │   └── video_loader.py
│       ├── model/
│       │   ├── __init__.py
│       │   ├── base.py
│       │   ├── builder.py
│       │   ├── cogvlm.py
│       │   ├── deepseek.py
│       │   ├── deepseek_vl2.py
│       │   ├── gemma3_vl.py
│       │   ├── glm4_1v.py
│       │   ├── glm4_v.py
│       │   ├── interns1_pro.py
│       │   ├── internvl.py
│       │   ├── internvl3_hf.py
│       │   ├── internvl_llava.py
│       │   ├── llama4.py
│       │   ├── llava.py
│       │   ├── llava_hf.py
│       │   ├── llava_next.py
│       │   ├── minicpmv.py
│       │   ├── mllama.py
│       │   ├── molmo.py
│       │   ├── phi3_vision.py
│       │   ├── qwen.py
│       │   ├── qwen2.py
│       │   ├── qwen3.py
│       │   ├── qwen3_5.py
│       │   ├── utils.py
│       │   ├── xcomposer2.py
│       │   └── yi.py
│       ├── tools/
│       │   ├── __init__.py
│       │   └── merge_xcomposer2d5_task.py
│       └── utils.py
├── pyproject.toml
├── setup.py
├── src/
│   ├── CMakeLists.txt
│   └── turbomind/
│       ├── CMakeLists.txt
│       ├── comm/
│       │   ├── CMakeLists.txt
│       │   ├── barrier.h
│       │   ├── cuda_ipc/
│       │   │   ├── CMakeLists.txt
│       │   │   ├── allgather.cu
│       │   │   ├── allreduce.cu
│       │   │   ├── bootstrap.h
│       │   │   ├── broadcast.cu
│       │   │   ├── common.h
│       │   │   ├── cuda_ipc_comm.cu
│       │   │   ├── cuda_ipc_comm.h
│       │   │   ├── fused_allreduce.cu
│       │   │   ├── fused_allreduce_ex.cu
│       │   │   ├── group_sum.h
│       │   │   ├── mscclpp.h
│       │   │   ├── multimem.cuh
│       │   │   ├── semaphore.cuh
│       │   │   └── semaphore.h
│       │   ├── device_comm.cc
│       │   ├── device_comm.h
│       │   ├── env.h
│       │   ├── gloo/
│       │   │   ├── CMakeLists.txt
│       │   │   ├── gloo_comm.cc
│       │   │   ├── hybrid_comm.cc
│       │   │   ├── tcp_store.cc
│       │   │   ├── tcp_store.h
│       │   │   └── test_ipc_comm.cc
│       │   ├── host_comm.cc
│       │   ├── host_comm.h
│       │   ├── nccl/
│       │   │   ├── CMakeLists.txt
│       │   │   └── nccl.cu
│       │   ├── test_comm.cu
│       │   ├── test_host_comm.cc
│       │   └── thread_comm.cc
│       ├── core/
│       │   ├── CMakeLists.txt
│       │   ├── allocator.cc
│       │   ├── allocator.h
│       │   ├── buffer.cc
│       │   ├── buffer.h
│       │   ├── check.cc
│       │   ├── check.h
│       │   ├── common.h
│       │   ├── context.cc
│       │   ├── context.h
│       │   ├── copy.cc
│       │   ├── copy.h
│       │   ├── core.h
│       │   ├── cuda_data_type.h
│       │   ├── data_type.h
│       │   ├── interval.h
│       │   ├── layout.cc
│       │   ├── layout.h
│       │   ├── module.cc
│       │   ├── module.h
│       │   ├── ranges.h
│       │   ├── serdes.h
│       │   ├── state.h
│       │   ├── stream.cc
│       │   ├── stream.h
│       │   ├── tensor.cc
│       │   ├── tensor.cu
│       │   ├── tensor.h
│       │   └── test_core.cc
│       ├── engine/
│       │   ├── CMakeLists.txt
│       │   ├── batch.h
│       │   ├── engine.cc
│       │   ├── engine.h
│       │   ├── gateway.cc
│       │   ├── gateway.h
│       │   ├── model_executor.cc
│       │   ├── model_executor.h
│       │   ├── model_request.cc
│       │   ├── model_request.h
│       │   ├── queue.h
│       │   ├── request.cc
│       │   ├── request.h
│       │   ├── request_queue.cc
│       │   ├── request_queue.h
│       │   └── signal_buffer.h
│       ├── generation/
│       │   ├── CMakeLists.txt
│       │   ├── base_param.h
│       │   ├── generation.cc
│       │   ├── generation.h
│       │   ├── guided_decoding.cc
│       │   ├── guided_decoding.h
│       │   ├── logits_processor.cc
│       │   ├── logits_processor.h
│       │   ├── sampling.cc
│       │   ├── sampling.h
│       │   ├── stop_criteria.cc
│       │   ├── stop_criteria.h
│       │   └── utils.h
│       ├── kernels/
│       │   ├── CMakeLists.txt
│       │   ├── activation.cu
│       │   ├── activation.h
│       │   ├── activation_kernels.cu
│       │   ├── activation_kernels.h
│       │   ├── apply_token_bitmask_inplace_cuda.cu
│       │   ├── apply_token_bitmask_inplace_cuda.h
│       │   ├── attention/
│       │   │   ├── CMakeLists.txt
│       │   │   ├── arch.h
│       │   │   ├── attention.cu
│       │   │   ├── attention.h
│       │   │   ├── attention_params.h
│       │   │   ├── attention_template.h
│       │   │   ├── attention_universal.h
│       │   │   ├── block.h
│       │   │   ├── block_iterator.h
│       │   │   ├── cp_utils.cu
│       │   │   ├── cp_utils.h
│       │   │   ├── cta_map.h
│       │   │   ├── decoding.cu
│       │   │   ├── decoding.h
│       │   │   ├── decoding_template.h
│       │   │   ├── desc.h
│       │   │   ├── impl.h
│       │   │   ├── impl_16816.h
│       │   │   ├── impl_1688.h
│       │   │   ├── impl_81616.h
│       │   │   ├── impl_884.h
│       │   │   ├── impl_m16n8.h
│       │   │   ├── impl_simt.h
│       │   │   ├── iterator.h
│       │   │   ├── iterator_sm70.h
│       │   │   ├── iterator_sm80.h
│       │   │   ├── kernel/
│       │   │   │   ├── CMakeLists.txt
│       │   │   │   ├── attention_sm70_128.cu
│       │   │   │   ├── attention_sm70_256.cu
│       │   │   │   ├── attention_sm70_576.cu
│       │   │   │   ├── attention_sm70_64.cu
│       │   │   │   ├── attention_sm75_128.cu
│       │   │   │   ├── attention_sm75_256.cu
│       │   │   │   ├── attention_sm75_576.cu
│       │   │   │   ├── attention_sm75_64.cu
│       │   │   │   ├── attention_sm80_128.cu
│       │   │   │   ├── attention_sm80_192.cu
│       │   │   │   ├── attention_sm80_256.cu
│       │   │   │   ├── attention_sm80_576.cu
│       │   │   │   ├── attention_sm80_64.cu
│       │   │   │   ├── decoding_sm70_128.cu
│       │   │   │   ├── decoding_sm70_256.cu
│       │   │   │   ├── decoding_sm70_576.cu
│       │   │   │   ├── decoding_sm70_64.cu
│       │   │   │   ├── decoding_sm75_128.cu
│       │   │   │   ├── decoding_sm75_256.cu
│       │   │   │   ├── decoding_sm75_576.cu
│       │   │   │   ├── decoding_sm75_64.cu
│       │   │   │   ├── decoding_sm80_128.cu
│       │   │   │   ├── decoding_sm80_192.cu
│       │   │   │   ├── decoding_sm80_256.cu
│       │   │   │   ├── decoding_sm80_576.cu
│       │   │   │   └── decoding_sm80_64.cu
│       │   │   ├── kernel.h
│       │   │   ├── kernel_impl.h
│       │   │   ├── kv_cache_utils_v2.cu
│       │   │   ├── kv_cache_utils_v2.h
│       │   │   ├── linear_iterator.h
│       │   │   ├── mainloop.h
│       │   │   ├── mainloop_sm70.h
│       │   │   ├── mainloop_sm80.h
│       │   │   ├── quantization.h
│       │   │   ├── reduce.cu
│       │   │   ├── reduce.h
│       │   │   ├── reference.cu
│       │   │   ├── reference.h
│       │   │   ├── registrar.h
│       │   │   ├── registry.cu
│       │   │   ├── registry.h
│       │   │   ├── rotary_embedding.h
│       │   │   ├── test_attention.cu
│       │   │   ├── test_quant.cu
│       │   │   ├── test_utils.cu
│       │   │   ├── test_utils.h
│       │   │   ├── utils.cc
│       │   │   └── utils.h
│       │   ├── ban_bad_words.cu
│       │   ├── ban_bad_words.h
│       │   ├── core/
│       │   │   ├── array.h
│       │   │   ├── array_ops.h
│       │   │   ├── common.h
│       │   │   ├── data_type.h
│       │   │   ├── floating_point.h
│       │   │   ├── layout.h
│       │   │   ├── math.h
│       │   │   ├── meta.h
│       │   │   ├── mma.h
│       │   │   ├── pipe_iter.h
│       │   │   ├── smem.h
│       │   │   ├── sub_byte_ptr.h
│       │   │   ├── sync.h
│       │   │   └── thread_map.h
│       │   ├── decoding_kernels.cu
│       │   ├── decoding_kernels.h
│       │   ├── gemm/
│       │   │   ├── CMakeLists.txt
│       │   │   ├── arch/
│       │   │   │   ├── config_simt.h
│       │   │   │   ├── config_sm70_s884.h
│       │   │   │   ├── config_sm75_s16816.h
│       │   │   │   ├── config_sm80_s16816.h
│       │   │   │   ├── mma_simt.h
│       │   │   │   ├── mma_sm70.h
│       │   │   │   ├── mma_sm80.h
│       │   │   │   ├── operand_simt.h
│       │   │   │   ├── operand_sm70_s884.h
│       │   │   │   ├── operand_sm80_s16816.h
│       │   │   │   ├── smem_copy_simt.h
│       │   │   │   ├── smem_copy_sm70.h
│       │   │   │   └── smem_copy_sm80.h
│       │   │   ├── arch.h
│       │   │   ├── cast.cu
│       │   │   ├── cast.h
│       │   │   ├── context.cu
│       │   │   ├── context.h
│       │   │   ├── convert.cuh
│       │   │   ├── convert.h
│       │   │   ├── convert_v3.cu
│       │   │   ├── cp_async.h
│       │   │   ├── cta_map.h
│       │   │   ├── cublas.cu
│       │   │   ├── desc.h
│       │   │   ├── dispatch_cache.cu
│       │   │   ├── dispatch_cache.h
│       │   │   ├── epilogue.h
│       │   │   ├── format.h
│       │   │   ├── gemm.cu
│       │   │   ├── gemm.h
│       │   │   ├── gemm_universal.h
│       │   │   ├── gemm_universal_sm90.h
│       │   │   ├── gemm_universal_sm90_v2.h
│       │   │   ├── gemm_universal_sm90_v3.h
│       │   │   ├── gemm_universal_sm90_v4.h
│       │   │   ├── gemm_universal_sm90_v5.h
│       │   │   ├── gpu_metric.cu
│       │   │   ├── gpu_metric.h
│       │   │   ├── iterator.h
│       │   │   ├── iterator_sm70.h
│       │   │   ├── iterator_sm80.h
│       │   │   ├── iterator_sm90.h
│       │   │   ├── kernel/
│       │   │   │   ├── sm70_884_16.cu
│       │   │   │   ├── sm70_884_4.cu
│       │   │   │   ├── sm70_884_8.cu
│       │   │   │   ├── sm75_16816_16.cu
│       │   │   │   ├── sm75_16816_4.cu
│       │   │   │   ├── sm75_16816_8.cu
│       │   │   │   ├── sm80_16816_16.cu
│       │   │   │   ├── sm80_16816_4.cu
│       │   │   │   ├── sm80_16816_8.cu
│       │   │   │   ├── sm90_16816_16.cu
│       │   │   │   ├── sm90_16816_4.cu
│       │   │   │   ├── sm90_16816_8.cu
│       │   │   │   └── sm90_64n32_8.cu
│       │   │   ├── kernel.cu
│       │   │   ├── kernel.h
│       │   │   ├── kernel_impl.h
│       │   │   ├── kernel_impl_sm90.h
│       │   │   ├── mainloop_sm70.h
│       │   │   ├── mainloop_sm80_v2.h
│       │   │   ├── matrix_ptr.h
│       │   │   ├── moe_utils_v2.cu
│       │   │   ├── moe_utils_v2.h
│       │   │   ├── operand.h
│       │   │   ├── predicate.h
│       │   │   ├── registry.cu
│       │   │   ├── registry.h
│       │   │   ├── scaled_gmma_fp8_sm90.h
│       │   │   ├── scheduler.cuh
│       │   │   ├── scheduler_sm70.cuh
│       │   │   ├── simt.h
│       │   │   ├── sm90_utils.h
│       │   │   ├── smem_copy.h
│       │   │   ├── test/
│       │   │   │   ├── gemm_bench.cu
│       │   │   │   ├── models.h
│       │   │   │   ├── quantization.cu
│       │   │   │   ├── quantization.h
│       │   │   │   ├── quantization_impl.h
│       │   │   │   ├── reference.cu
│       │   │   │   ├── reference.h
│       │   │   │   ├── test_gemm_v2.cc
│       │   │   │   ├── test_moe_utils.cu
│       │   │   │   ├── test_utils.cu
│       │   │   │   ├── test_utils.h
│       │   │   │   └── testbed_v3.h
│       │   │   ├── thread_group_map.h
│       │   │   ├── thread_map.h
│       │   │   ├── tiled_mma.h
│       │   │   ├── tma.cu
│       │   │   ├── tma.h
│       │   │   ├── transform.h
│       │   │   ├── tuner/
│       │   │   │   ├── cache_utils.cu
│       │   │   │   ├── cache_utils.h
│       │   │   │   ├── measurer.cu
│       │   │   │   ├── measurer.h
│       │   │   │   ├── params.cc
│       │   │   │   ├── params.h
│       │   │   │   ├── sampler.cu
│       │   │   │   ├── sampler.h
│       │   │   │   ├── stats.h
│       │   │   │   ├── stopping_criterion.cc
│       │   │   │   └── stopping_criterion.h
│       │   │   ├── types.h
│       │   │   ├── unpack.cu
│       │   │   └── utils.h
│       │   ├── gpt_kernels.cu
│       │   ├── gpt_kernels.h
│       │   ├── logprob_kernels.cu
│       │   ├── logprob_kernels.h
│       │   ├── norm/
│       │   │   ├── CMakeLists.txt
│       │   │   ├── rms_norm.cu
│       │   │   └── rms_norm.h
│       │   ├── penalty_types.h
│       │   ├── quantization.cu
│       │   ├── quantization.cuh
│       │   ├── quantization.h
│       │   ├── reduce_kernel_utils.cuh
│       │   ├── sampling_kernels.cu
│       │   ├── sampling_kernels.h
│       │   ├── sampling_penalty_kernels.cu
│       │   ├── sampling_penalty_kernels.h
│       │   ├── sampling_topk_kernels.cu
│       │   ├── sampling_topk_kernels.h
│       │   ├── sampling_topp_kernels.cu
│       │   ├── sampling_topp_kernels.h
│       │   ├── stop_criteria_kernels.cu
│       │   ├── stop_criteria_kernels.h
│       │   ├── test_quantization.cc
│       │   ├── unfused_attention_kernels.cu
│       │   └── unfused_attention_kernels.h
│       ├── macro.h
│       ├── models/
│       │   ├── CMakeLists.txt
│       │   ├── input_processor.cc
│       │   ├── input_processor.h
│       │   ├── language_model.cc
│       │   ├── language_model.h
│       │   ├── llama/
│       │   │   ├── Barrier.h
│       │   │   ├── BlockManager.cc
│       │   │   ├── BlockManager.h
│       │   │   ├── BlockTrie.cc
│       │   │   ├── BlockTrie.h
│       │   │   ├── CMakeLists.txt
│       │   │   ├── GatedDeltaNetLayer.cc
│       │   │   ├── GatedDeltaNetLayer.h
│       │   │   ├── GatedDeltaNetWeight.cc
│       │   │   ├── GatedDeltaNetWeight.h
│       │   │   ├── LlamaDecoderLayerWeight.cc
│       │   │   ├── LlamaDecoderLayerWeight.h
│       │   │   ├── LlamaDenseWeight.cc
│       │   │   ├── LlamaDenseWeight.h
│       │   │   ├── LlamaFfnLayer.cc
│       │   │   ├── LlamaFfnLayer.h
│       │   │   ├── LlamaLinear.cu
│       │   │   ├── LlamaLinear.h
│       │   │   ├── LlamaWeight.cc
│       │   │   ├── LlamaWeight.h
│       │   │   ├── SequenceManager.cc
│       │   │   ├── SequenceManager.h
│       │   │   ├── bench_conv1d_silu.cc
│       │   │   ├── bench_gated_delta_net.cc
│       │   │   ├── context.h
│       │   │   ├── gated_delta_net_kernels.cu
│       │   │   ├── gated_delta_net_kernels.h
│       │   │   ├── llama_kernels.cu
│       │   │   ├── llama_kernels.h
│       │   │   ├── llama_params.h
│       │   │   ├── llama_rope.h
│       │   │   ├── llama_utils.cu
│       │   │   ├── llama_utils.h
│       │   │   ├── mla_utils.cu
│       │   │   ├── mla_utils.h
│       │   │   ├── moe_ffn_layer.cc
│       │   │   ├── moe_ffn_layer.h
│       │   │   ├── test_cache_manager.cc
│       │   │   ├── unified_attention_layer.cc
│       │   │   ├── unified_attention_layer.h
│       │   │   ├── unified_decoder.cc
│       │   │   └── unified_decoder.h
│       │   ├── output_processor.cc
│       │   └── output_processor.h
│       ├── python/
│       │   ├── CMakeLists.txt
│       │   ├── bind.cpp
│       │   ├── dlpack.h
│       │   └── xgrammar_bind.cpp
│       ├── turbomind.cc
│       ├── turbomind.h
│       └── utils/
│           ├── CMakeLists.txt
│           ├── anomaly_handler.cu
│           ├── anomaly_handler.h
│           ├── constant.h
│           ├── cuda_bf16_fallbacks.cuh
│           ├── cuda_bf16_wrapper.h
│           ├── cuda_type_utils.cuh
│           ├── cuda_utils.cc
│           ├── cuda_utils.h
│           ├── debug_utils.h
│           ├── dispatch.h
│           ├── logger.cc
│           ├── logger.h
│           ├── memory_utils.cu
│           ├── memory_utils.h
│           ├── metrics.h
│           ├── monotonic.h
│           ├── nvtx_utils.cc
│           ├── nvtx_utils.h
│           ├── parser.cc
│           ├── parser.h
│           ├── string_utils.h
│           └── test_utils.h
└── tests/
    ├── csrc/
    │   ├── CMakeLists.txt
    │   └── unittests/
    │       ├── CMakeLists.txt
    │       ├── gtest_utils.h
    │       ├── test_logprob_kernels.cu
    │       ├── test_penalty_kernels.cu
    │       ├── test_sampling_kernels.cu
    │       ├── test_sampling_layer.cu
    │       └── unittest_utils.h
    ├── pytorch/
    │   ├── config/
    │   │   └── test_hf_overrides.py
    │   ├── engine/
    │   │   ├── test_logits_process.py
    │   │   ├── test_request.py
    │   │   └── test_zmq_rpc.py
    │   ├── kernel/
    │   │   ├── test_activation.py
    │   │   ├── test_apply_rotary.py
    │   │   ├── test_bitonic_topk.py
    │   │   ├── test_causal_conv1d.py
    │   │   ├── test_ds_index.py
    │   │   ├── test_fill_kv_cache.py
    │   │   ├── test_flash_attention.py
    │   │   ├── test_flatten_kv_cache.py
    │   │   ├── test_fuse_moe_blocked_fp8.py
    │   │   ├── test_fused_lora.py
    │   │   ├── test_fused_moe.py
    │   │   ├── test_gated_delta_rule.py
    │   │   ├── test_gemm_fp8.py
    │   │   ├── test_moe_route.py
    │   │   ├── test_multinomial_sampling.py
    │   │   ├── test_paged_attention.py
    │   │   └── test_rms_norm.py
    │   ├── nn/
    │   │   └── test_embedding.py
    │   └── paging/
    │       ├── test_block_manager.py
    │       ├── test_block_trie.py
    │       └── test_scheduler.py
    └── test_lmdeploy/
        ├── test_auto_backend.py
        ├── test_content_merge.py
        ├── test_grammar.py
        ├── test_harmony_gpt_oss_parser.py
        ├── test_lite/
        │   └── test_quantization/
        │       └── test_utils/
        │           └── test_cal_qparams.py
        ├── test_messages.py
        ├── test_model.py
        ├── test_pipeline.py
        ├── test_qwen3_parser.py
        ├── test_qwen3coder_parser.py
        ├── test_tokenizer.py
        ├── test_turbomind/
        │   └── test_converter.py
        ├── test_utils.py
        └── test_vl/
            ├── test_hf_chat_template.py
            ├── test_nonhf_chat_template.py
            ├── test_qwen3vl_processor.py
            └── test_vl_encode.py