gitextract_0e22n38f/

├── .github/
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── ISSUE_TEMPLATE/
│   │   ├── -bug-.yaml
│   │   ├── -feature-.yaml
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── SECURITY.md
│   └── workflows/
│       ├── book-ci.yml
│       ├── deploy.yml
│       ├── docker-image.yml
│       ├── kt-kernel-tests.yml
│       ├── release-fake-tag.yml
│       ├── release-pypi.yml
│       ├── release-sglang-kt.yml
│       └── sync-sglang-submodule.yml
├── .gitignore
├── .gitmodules
├── LICENSE
├── MAINTAINERS.md
├── README.md
├── README_ZH.md
├── archive/
│   ├── .devcontainer/
│   │   ├── Dockerfile
│   │   └── devcontainer.json
│   ├── .flake8
│   ├── .gitmodules
│   ├── .pylintrc
│   ├── Dockerfile
│   ├── Dockerfile.xpu
│   ├── LICENSE
│   ├── MANIFEST.in
│   ├── Makefile
│   ├── README.md
│   ├── README_LEGACY.md
│   ├── README_ZH.md
│   ├── README_ZH_LEGACY.md
│   ├── SECURITY.md
│   ├── book.toml
│   ├── config.json
│   ├── csrc/
│   │   ├── balance_serve/
│   │   │   └── CMakeLists.txt
│   │   ├── custom_marlin/
│   │   │   ├── __init__.py
│   │   │   ├── binding.cpp
│   │   │   ├── gptq_marlin/
│   │   │   │   ├── gptq_marlin.cu
│   │   │   │   ├── gptq_marlin.cuh
│   │   │   │   ├── gptq_marlin_dtypes.cuh
│   │   │   │   ├── gptq_marlin_repack.cu
│   │   │   │   └── ops.h
│   │   │   ├── setup.py
│   │   │   ├── test_cuda_graph.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── format24.py
│   │   │       ├── marlin_24_perms.py
│   │   │       ├── marlin_perms.py
│   │   │       ├── marlin_utils.py
│   │   │       └── quant_utils.py
│   │   └── ktransformers_ext/
│   │       ├── CMakeLists.txt
│   │       ├── bench/
│   │       │   ├── bench_attention.py
│   │       │   ├── bench_attention_torch.py
│   │       │   ├── bench_linear.py
│   │       │   ├── bench_linear_torch.py
│   │       │   ├── bench_mlp.py
│   │       │   ├── bench_mlp_torch.py
│   │       │   ├── bench_moe.py
│   │       │   ├── bench_moe_amx.py
│   │       │   └── bench_moe_torch.py
│   │       ├── cmake/
│   │       │   └── FindSIMD.cmake
│   │       ├── cpu_backend/
│   │       │   ├── backend.cpp
│   │       │   ├── backend.h
│   │       │   ├── cpuinfer.h
│   │       │   ├── shared_mem_buffer.cpp
│   │       │   ├── shared_mem_buffer.h
│   │       │   ├── task_queue.cpp
│   │       │   ├── task_queue.h
│   │       │   └── vendors/
│   │       │       ├── README.md
│   │       │       ├── cuda.h
│   │       │       ├── hip.h
│   │       │       ├── musa.h
│   │       │       └── vendor.h
│   │       ├── cuda/
│   │       │   ├── binding.cpp
│   │       │   ├── custom_gguf/
│   │       │   │   ├── dequant.cu
│   │       │   │   └── ops.h
│   │       │   ├── gptq_marlin/
│   │       │   │   ├── gptq_marlin.cu
│   │       │   │   ├── gptq_marlin.cuh
│   │       │   │   ├── gptq_marlin_dtypes.cuh
│   │       │   │   └── ops.h
│   │       │   ├── setup.py
│   │       │   └── test_dequant.py
│   │       ├── examples/
│   │       │   ├── test_attention.py
│   │       │   ├── test_linear.py
│   │       │   ├── test_mlp.py
│   │       │   └── test_moe.py
│   │       ├── ext_bindings.cpp
│   │       ├── operators/
│   │       │   ├── amx/
│   │       │   │   ├── la/
│   │       │   │   │   ├── amx.hpp
│   │       │   │   │   └── utils.hpp
│   │       │   │   └── moe.hpp
│   │       │   ├── kvcache/
│   │       │   │   ├── kvcache.h
│   │       │   │   ├── kvcache_attn.cpp
│   │       │   │   ├── kvcache_load_dump.cpp
│   │       │   │   ├── kvcache_read_write.cpp
│   │       │   │   └── kvcache_utils.cpp
│   │       │   └── llamafile/
│   │       │       ├── conversion.h
│   │       │       ├── linear.cpp
│   │       │       ├── linear.h
│   │       │       ├── mlp.cpp
│   │       │       ├── mlp.h
│   │       │       ├── moe.cpp
│   │       │       └── moe.h
│   │       └── vendors/
│   │           ├── cuda.h
│   │           ├── hip.h
│   │           ├── musa.h
│   │           └── vendor.h
│   ├── install-with-cache.sh
│   ├── install.bat
│   ├── install.sh
│   ├── ktransformers/
│   │   ├── __init__.py
│   │   ├── configs/
│   │   │   ├── config.yaml
│   │   │   └── log_config.ini
│   │   ├── ktransformers_ext/
│   │   │   ├── operators/
│   │   │   │   └── custom_marlin/
│   │   │   │       └── quantize/
│   │   │   │           └── utils/
│   │   │   │               ├── __init__.py
│   │   │   │               ├── format_24.py
│   │   │   │               ├── marlin_24_perms.py
│   │   │   │               ├── marlin_perms.py
│   │   │   │               ├── marlin_utils.py
│   │   │   │               └── quant_utils.py
│   │   │   └── triton/
│   │   │       └── fp8gemm.py
│   │   ├── local_chat.py
│   │   ├── local_chat_test.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── ascend/
│   │   │   │   ├── custom_ascend_modeling_deepseek_v3.py
│   │   │   │   └── custom_ascend_modeling_qwen3.py
│   │   │   ├── configuration_deepseek.py
│   │   │   ├── configuration_deepseek_v3.py
│   │   │   ├── configuration_glm4_moe.py
│   │   │   ├── configuration_llama.py
│   │   │   ├── configuration_qwen2_moe.py
│   │   │   ├── configuration_qwen3_moe.py
│   │   │   ├── configuration_qwen3_next.py
│   │   │   ├── configuration_smallthinker.py
│   │   │   ├── custom_cache.py
│   │   │   ├── custom_modeling_deepseek_v2.py
│   │   │   ├── custom_modeling_deepseek_v3.py
│   │   │   ├── custom_modeling_glm4_moe.py
│   │   │   ├── custom_modeling_qwen2_moe.py
│   │   │   ├── custom_modeling_qwen3_moe.py
│   │   │   ├── custom_modeling_qwen3_next.py
│   │   │   ├── custom_modeling_smallthinker.py
│   │   │   ├── modeling_deepseek.py
│   │   │   ├── modeling_deepseek_v3.py
│   │   │   ├── modeling_glm4_moe.py
│   │   │   ├── modeling_llama.py
│   │   │   ├── modeling_mixtral.py
│   │   │   ├── modeling_qwen2_moe.py
│   │   │   ├── modeling_qwen3_moe.py
│   │   │   ├── modeling_qwen3_next.py
│   │   │   └── modeling_smallthinker.py
│   │   ├── operators/
│   │   │   ├── RoPE.py
│   │   │   ├── __init__.py
│   │   │   ├── ascend/
│   │   │   │   ├── ascend_attention.py
│   │   │   │   ├── ascend_experts.py
│   │   │   │   ├── ascend_gate.py
│   │   │   │   ├── ascend_layernorm.py
│   │   │   │   ├── ascend_linear.py
│   │   │   │   └── ascend_mlp.py
│   │   │   ├── attention.py
│   │   │   ├── balance_serve_attention.py
│   │   │   ├── base_operator.py
│   │   │   ├── cpuinfer.py
│   │   │   ├── dynamic_attention.py
│   │   │   ├── experts.py
│   │   │   ├── flashinfer_batch_prefill_wrapper.py
│   │   │   ├── flashinfer_wrapper.py
│   │   │   ├── gate.py
│   │   │   ├── layernorm.py
│   │   │   ├── linear.py
│   │   │   ├── mlp.py
│   │   │   ├── models.py
│   │   │   ├── triton_attention.py
│   │   │   └── triton_attention_prefill.py
│   │   ├── optimize/
│   │   │   ├── optimize.py
│   │   │   └── optimize_rules/
│   │   │       ├── DeepSeek-V2-Chat-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V2-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Chat.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-gpu-cpu.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat.yaml
│   │   │       ├── DeepSeek-V3-Chat-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-8.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-marlin.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V3-Chat-npu.yaml
│   │   │       ├── DeepSeek-V3-Chat-serve.yaml
│   │   │       ├── DeepSeek-V3-Chat.yaml
│   │   │       ├── Glm4Moe-serve.yaml
│   │   │       ├── Internlm2_5-7b-Chat-1m.yaml
│   │   │       ├── Mixtral.yaml
│   │   │       ├── Moonlight-16B-A3B-serve.yaml
│   │   │       ├── Moonlight-16B-A3B.yaml
│   │   │       ├── Qwen2-57B-A14B-Instruct-multi-gpu.yaml
│   │   │       ├── Qwen2-57B-A14B-Instruct.yaml
│   │   │       ├── Qwen2-serve-amx.yaml
│   │   │       ├── Qwen2-serve.yaml
│   │   │       ├── Qwen3Moe-serve-amx.yaml
│   │   │       ├── Qwen3Moe-serve.yaml
│   │   │       ├── Qwen3Next-serve.yaml
│   │   │       ├── Smallthinker-serve.yaml
│   │   │       ├── npu/
│   │   │       │   ├── DeepSeek-V3-Chat-300IA2-npu-serve.yaml
│   │   │       │   ├── DeepSeek-V3-Chat-300IA2-npu.yaml
│   │   │       │   └── Qwen3-Chat-300IA2-npu-serve.yaml
│   │   │       ├── rocm/
│   │   │       │   └── DeepSeek-V3-Chat.yaml
│   │   │       └── xpu/
│   │   │           ├── DeepSeek-V2-Chat.yaml
│   │   │           ├── DeepSeek-V3-Chat.yaml
│   │   │           └── Qwen3Moe-Chat.yaml
│   │   ├── server/
│   │   │   ├── __init__.py
│   │   │   ├── api/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ollama/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── completions.py
│   │   │   │   ├── openai/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── assistants/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── assistants.py
│   │   │   │   │   │   ├── messages.py
│   │   │   │   │   │   ├── runs.py
│   │   │   │   │   │   └── threads.py
│   │   │   │   │   ├── endpoints/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── chat.py
│   │   │   │   │   └── legacy/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── completions.py
│   │   │   │   └── web/
│   │   │   │       ├── __init__.py
│   │   │   │       └── system.py
│   │   │   ├── args.py
│   │   │   ├── backend/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── base.py
│   │   │   │   ├── context_manager.py
│   │   │   │   └── interfaces/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── balance_serve.py
│   │   │   │       ├── exllamav2.py
│   │   │   │       ├── ktransformers.py
│   │   │   │       └── transformers.py
│   │   │   ├── balance_serve/
│   │   │   │   ├── inference/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── config.py
│   │   │   │   │   ├── distributed/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── communication_op.py
│   │   │   │   │   │   ├── cuda_wrapper.py
│   │   │   │   │   │   ├── custom_all_reduce.py
│   │   │   │   │   │   ├── custom_all_reduce_utils.py
│   │   │   │   │   │   ├── parallel_state.py
│   │   │   │   │   │   ├── pynccl.py
│   │   │   │   │   │   ├── pynccl_wrapper.py
│   │   │   │   │   │   └── utils.py
│   │   │   │   │   ├── forward_batch.py
│   │   │   │   │   ├── model_runner.py
│   │   │   │   │   ├── query_manager.py
│   │   │   │   │   └── sampling/
│   │   │   │   │       ├── penaltylib/
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── orchestrator.py
│   │   │   │   │       │   └── penalizers/
│   │   │   │   │       │       ├── frequency_penalty.py
│   │   │   │   │       │       ├── min_new_tokens.py
│   │   │   │   │       │       ├── presence_penalty.py
│   │   │   │   │       │       └── repetition_penalty.py
│   │   │   │   │       └── sampler.py
│   │   │   │   ├── sched_rpc.py
│   │   │   │   └── settings.py
│   │   │   ├── config/
│   │   │   │   ├── config.py
│   │   │   │   ├── log.py
│   │   │   │   └── singleton.py
│   │   │   ├── crud/
│   │   │   │   ├── __init__.py
│   │   │   │   └── assistants/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── assistants.py
│   │   │   │       ├── messages.py
│   │   │   │       ├── runs.py
│   │   │   │       └── threads.py
│   │   │   ├── exceptions.py
│   │   │   ├── main.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   └── assistants/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── assistants.py
│   │   │   │       ├── messages.py
│   │   │   │       ├── run_steps.py
│   │   │   │       ├── runs.py
│   │   │   │       └── threads.py
│   │   │   ├── requirements.txt
│   │   │   ├── schemas/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── assistants/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── assistants.py
│   │   │   │   │   ├── messages.py
│   │   │   │   │   ├── runs.py
│   │   │   │   │   ├── streaming.py
│   │   │   │   │   ├── threads.py
│   │   │   │   │   └── tool.py
│   │   │   │   ├── base.py
│   │   │   │   ├── conversation.py
│   │   │   │   ├── endpoints/
│   │   │   │   │   └── chat.py
│   │   │   │   └── legacy/
│   │   │   │       ├── __init__.py
│   │   │   │       └── completions.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── create_interface.py
│   │   │       ├── multi_timer.py
│   │   │       ├── serve_profiling.py
│   │   │       └── sql_utils.py
│   │   ├── tests/
│   │   │   ├── .gitignore
│   │   │   ├── AIME_2024/
│   │   │   │   ├── eval_api.py
│   │   │   │   ├── evaluation.py
│   │   │   │   └── prompts.py
│   │   │   ├── UT/
│   │   │   │   ├── test_kdeepseek_attention_w8a8a2serve_npu.py
│   │   │   │   └── test_kdeepseek_ln_npu.py
│   │   │   ├── dequant_gpu.py
│   │   │   ├── dequant_gpu_t.py
│   │   │   ├── function_call_test.py
│   │   │   ├── humaneval/
│   │   │   │   ├── eval_api.py
│   │   │   │   ├── evaluation.py
│   │   │   │   └── prompts.py
│   │   │   ├── mmlu_pro_test.py
│   │   │   ├── mmlu_test.py
│   │   │   ├── mmlu_test_multi.py
│   │   │   ├── parse_cover_info.py
│   │   │   ├── score.py
│   │   │   ├── test_client.py
│   │   │   ├── test_prefix.py
│   │   │   ├── test_pytorch_q8.py
│   │   │   ├── test_speed.py
│   │   │   └── triton_fp8gemm_test.py
│   │   ├── util/
│   │   │   ├── ascend/
│   │   │   │   └── ascend_utils.py
│   │   │   ├── cuda_graph_runner.py
│   │   │   ├── custom_gguf.py
│   │   │   ├── custom_loader.py
│   │   │   ├── modeling_rope_utils.py
│   │   │   ├── npu_graph_runner.py
│   │   │   ├── textstream.py
│   │   │   ├── utils.py
│   │   │   ├── vendors.py
│   │   │   └── weight_loader.py
│   │   └── website/
│   │       ├── .browserslistrc
│   │       ├── .eslintrc.js
│   │       ├── .gitignore
│   │       ├── README.md
│   │       ├── config.d.ts
│   │       ├── jest.config.js
│   │       ├── package.json
│   │       ├── public/
│   │       │   ├── config.js
│   │       │   ├── css/
│   │       │   │   └── reset.css
│   │       │   └── index.html
│   │       ├── src/
│   │       │   ├── App.vue
│   │       │   ├── api/
│   │       │   │   ├── api-client.ts
│   │       │   │   ├── assistant.ts
│   │       │   │   ├── message.ts
│   │       │   │   ├── run.ts
│   │       │   │   └── thread.ts
│   │       │   ├── assets/
│   │       │   │   ├── css/
│   │       │   │   │   └── mixins.styl
│   │       │   │   └── iconfont/
│   │       │   │       ├── demo.css
│   │       │   │       ├── demo_index.html
│   │       │   │       ├── iconfont.css
│   │       │   │       ├── iconfont.js
│   │       │   │       └── iconfont.json
│   │       │   ├── components/
│   │       │   │   └── chat/
│   │       │   │       └── index.vue
│   │       │   ├── conf/
│   │       │   │   └── config.ts
│   │       │   ├── locals/
│   │       │   │   ├── en.js
│   │       │   │   ├── index.js
│   │       │   │   └── zh.js
│   │       │   ├── main.ts
│   │       │   ├── router/
│   │       │   │   └── index.ts
│   │       │   ├── shims-vue.d.ts
│   │       │   ├── store/
│   │       │   │   └── index.ts
│   │       │   ├── utils/
│   │       │   │   ├── copy.ts
│   │       │   │   └── types.ts
│   │       │   └── views/
│   │       │       └── home.vue
│   │       ├── tests/
│   │       │   └── unit/
│   │       │       └── example.spec.ts
│   │       ├── tsconfig.json
│   │       └── vue.config.js
│   ├── merge_tensors/
│   │   ├── merge_safetensor_gguf.py
│   │   └── merge_safetensor_gguf_for_qwen3.py
│   ├── pyproject.toml
│   ├── requirements-local_chat.txt
│   ├── setup.py
│   └── third_party/
│       ├── llamafile/
│       │   ├── README.md
│       │   ├── bench.h
│       │   ├── flags.cpp
│       │   ├── flags.h
│       │   ├── iqk_mul_mat.inc
│       │   ├── iqk_mul_mat_amd_avx2.cpp
│       │   ├── iqk_mul_mat_amd_zen4.cpp
│       │   ├── iqk_mul_mat_arm.inc
│       │   ├── iqk_mul_mat_arm82.cpp
│       │   ├── iqk_mul_mat_x86.inc
│       │   ├── macros.h
│       │   ├── micros.h
│       │   ├── numba.h
│       │   ├── sgemm.cpp
│       │   ├── sgemm.h
│       │   ├── sgemm_arm.cpp
│       │   ├── sgemm_x86.cpp
│       │   ├── tinyblas_cpu.h
│       │   ├── tinyblas_cpu_mixmul.inc
│       │   ├── tinyblas_cpu_mixmul_amd_avx.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_avx2.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_avx512f.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_avxvnni.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_fma.cpp
│       │   ├── tinyblas_cpu_mixmul_amd_zen4.cpp
│       │   ├── tinyblas_cpu_mixmul_arm80.cpp
│       │   ├── tinyblas_cpu_mixmul_arm82.cpp
│       │   ├── tinyblas_cpu_sgemm.inc
│       │   ├── tinyblas_cpu_sgemm_amd_avx.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_avx2.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_avx512f.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_avxvnni.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_fma.cpp
│       │   ├── tinyblas_cpu_sgemm_amd_zen4.cpp
│       │   ├── tinyblas_cpu_sgemm_arm.inc
│       │   ├── tinyblas_cpu_sgemm_arm80.cpp
│       │   ├── tinyblas_cpu_sgemm_arm82.cpp
│       │   ├── tinyblas_cpu_sgemm_x86.inc
│       │   └── tinyblas_cpu_unsupported.cpp
│       └── nlohmann/
│           ├── json.hpp
│           └── json_fwd.hpp
├── book.toml
├── doc/
│   ├── SUMMARY.md
│   ├── basic/
│   │   ├── note1.md
│   │   └── note2.md
│   ├── en/
│   │   ├── AMX.md
│   │   ├── DeepseekR1_V3_tutorial.md
│   │   ├── Docker.md
│   │   ├── Docker_xpu.md
│   │   ├── FAQ.md
│   │   ├── Kimi-K2-Thinking.md
│   │   ├── Kimi-K2.5.md
│   │   ├── Kimi-K2.md
│   │   ├── Kllama_tutorial_DeepSeekV2Lite.ipynb
│   │   ├── MiniMax-M2.5.md
│   │   ├── Qwen3-Next.md
│   │   ├── Qwen3.5.md
│   │   ├── ROCm.md
│   │   ├── SFT/
│   │   │   ├── DPO_tutorial.md
│   │   │   ├── KTransformers-Fine-Tuning_Developer-Technical-Notes.md
│   │   │   ├── KTransformers-Fine-Tuning_User-Guide.md
│   │   │   ├── README.md
│   │   │   └── injection_tutorial.md
│   │   ├── SFT_Installation_Guide_KimiK2.5.md
│   │   ├── SFT_Installation_Guide_KimiK2.md
│   │   ├── SmallThinker_and_Glm4moe.md
│   │   ├── V3-success.md
│   │   ├── api/
│   │   │   └── server/
│   │   │       ├── api.md
│   │   │       ├── server.md
│   │   │       ├── tabby.md
│   │   │       └── website.md
│   │   ├── balance-serve.md
│   │   ├── benchmark.md
│   │   ├── deepseek-v2-injection.md
│   │   ├── fp8_kernel.md
│   │   ├── install.md
│   │   ├── kt-kernel/
│   │   │   ├── GLM-5-Tutorial.md
│   │   │   ├── Kimi-K2-Thinking-Native.md
│   │   │   ├── MiniMax-M2.1-Tutorial.md
│   │   │   ├── Native-Precision-Tutorial.md
│   │   │   ├── Qwen3-Coder-Next-Tutorial.md
│   │   │   ├── README.md
│   │   │   ├── amd_blis.md
│   │   │   ├── deepseek-v3.2-sglang-tutorial.md
│   │   │   ├── experts-sched-Tutorial.md
│   │   │   └── kt-cli.md
│   │   ├── llama4.md
│   │   ├── long_context_introduction.md
│   │   ├── long_context_tutorial.md
│   │   ├── makefile_usage.md
│   │   ├── multi-gpu-tutorial.md
│   │   ├── operators/
│   │   │   └── llamafile.md
│   │   ├── prefix_cache.md
│   │   └── xpu.md
│   └── zh/
│       ├── DeepseekR1_V3_tutorial_zh.md
│       ├── DeepseekR1_V3_tutorial_zh_for_Ascend_NPU.md
│       ├── KTransformers-Fine-Tuning_Developer-Technical-Notes_zh.md
│       ├── KTransformers-Fine-Tuning_User-Guide_zh.md
│       ├── Qwen3-MoE_tutorial_zh_for_Ascend_NPU.md
│       ├── api/
│       │   └── server/
│       │       ├── api.md
│       │       ├── server.md
│       │       ├── tabby.md
│       │       └── website.md
│       └── clawdbot_integration_guide.md
├── docker/
│   ├── Dockerfile
│   ├── README-packaging.md
│   ├── docker-utils.sh
│   └── push-to-dockerhub.sh
├── install.sh
├── kt-kernel/
│   ├── .clang-format
│   ├── .githooks/
│   │   ├── commit-msg
│   │   └── pre-commit
│   ├── .gitignore
│   ├── .gitmodules
│   ├── CMakeLists.txt
│   ├── CMakePresets.json
│   ├── MANIFEST.in
│   ├── README.md
│   ├── README_zh.md
│   ├── bench/
│   │   ├── .gitignore
│   │   ├── Makefile
│   │   ├── bench_attention.py
│   │   ├── bench_attention_torch.py
│   │   ├── bench_bf16_moe.py
│   │   ├── bench_fp8_moe.py
│   │   ├── bench_fp8_perchannel_moe.py
│   │   ├── bench_k2_moe_amx.py
│   │   ├── bench_k2_write_buffer.py
│   │   ├── bench_linear.py
│   │   ├── bench_linear_torch.py
│   │   ├── bench_mla.py
│   │   ├── bench_mlp.py
│   │   ├── bench_mlp_torch.py
│   │   ├── bench_moe.py
│   │   ├── bench_moe_amx.py
│   │   ├── bench_moe_amx_k.py
│   │   ├── bench_moe_kernel.py
│   │   ├── bench_moe_kernel_tiling.py
│   │   ├── bench_moe_kml.py
│   │   ├── bench_moe_torch.py
│   │   ├── bench_write_buffer.py
│   │   ├── compare_moe_performance.py
│   │   ├── multi_bench_moe.py
│   │   └── upload-bench-json.py
│   ├── cmake/
│   │   ├── DetectCPU.cmake
│   │   └── FindSIMD.cmake
│   ├── cpu_backend/
│   │   ├── cpuinfer.h
│   │   ├── shared_mem_buffer.cpp
│   │   ├── shared_mem_buffer.h
│   │   ├── task_queue.cpp
│   │   ├── task_queue.h
│   │   ├── vendors/
│   │   │   ├── README.md
│   │   │   ├── cuda.h
│   │   │   ├── hip.h
│   │   │   ├── musa.h
│   │   │   └── vendor.h
│   │   ├── worker_pool.cpp
│   │   └── worker_pool.h
│   ├── cuda/
│   │   ├── binding.cpp
│   │   ├── custom_gguf/
│   │   │   ├── dequant.cu
│   │   │   └── ops.h
│   │   ├── gptq_marlin/
│   │   │   ├── gptq_marlin.cu
│   │   │   ├── gptq_marlin.cuh
│   │   │   ├── gptq_marlin_dtypes.cuh
│   │   │   └── ops.h
│   │   ├── moe/
│   │   │   ├── moe_topk_softmax_kernels.cu
│   │   │   ├── ops.h
│   │   │   └── utils.h
│   │   ├── setup.py
│   │   └── test_dequant.py
│   ├── demo/
│   │   ├── .gitignore
│   │   ├── Makefile
│   │   ├── bench_reorder_bandwidth.cpp
│   │   ├── bf16-test.cpp
│   │   ├── fp16-test.cpp
│   │   ├── plot.py
│   │   ├── simple_test.cpp
│   │   ├── simple_test_aocl.cpp
│   │   └── tflops.py
│   ├── examples/
│   │   ├── .gitignore
│   │   ├── bench_moe_amx_int8.py
│   │   ├── configuration_deepseek_v3.py
│   │   ├── modeling_deepseek_v3.py
│   │   ├── repro_llamafile_re.py
│   │   ├── test-debug.py
│   │   ├── test_apply_rope.py
│   │   ├── test_attention.py
│   │   ├── test_awq_moe_amx.py
│   │   ├── test_bf16_moe.py
│   │   ├── test_deepseekv3.py
│   │   ├── test_deepseekv3_prefill.py
│   │   ├── test_deepseekv3_prefill_speed.py
│   │   ├── test_fp8_moe.py
│   │   ├── test_fp8_perchannel_moe.py
│   │   ├── test_gate.py
│   │   ├── test_k2_moe_amx.py
│   │   ├── test_k2_write_buffer.py
│   │   ├── test_linear.py
│   │   ├── test_mla.py
│   │   ├── test_mla_qlen.py
│   │   ├── test_mla_quant.py
│   │   ├── test_mla_simple.py
│   │   ├── test_mla_torch.py
│   │   ├── test_mlp.py
│   │   ├── test_moe.py
│   │   ├── test_moe_amx.py
│   │   ├── test_moe_kernel.py
│   │   ├── test_moe_kml.py
│   │   ├── test_rope.cpp
│   │   ├── test_rope.py
│   │   ├── test_softmax.py
│   │   ├── test_write_buffer.py
│   │   └── torch_attention.py
│   ├── ext_bindings.cpp
│   ├── install.sh
│   ├── operators/
│   │   ├── amx/
│   │   │   ├── awq-moe.hpp
│   │   │   ├── bf16-moe.hpp
│   │   │   ├── fp8-moe.hpp
│   │   │   ├── fp8-perchannel-moe.hpp
│   │   │   ├── k2-moe.hpp
│   │   │   ├── la/
│   │   │   │   ├── amx-example.cpp
│   │   │   │   ├── amx.hpp
│   │   │   │   ├── amx_buffers.hpp
│   │   │   │   ├── amx_config.hpp
│   │   │   │   ├── amx_kernels.hpp
│   │   │   │   ├── amx_quantization.hpp
│   │   │   │   ├── amx_raw_buffers.hpp
│   │   │   │   ├── amx_raw_kernels.hpp
│   │   │   │   ├── amx_utils.hpp
│   │   │   │   ├── pack.hpp
│   │   │   │   └── utils.hpp
│   │   │   ├── moe.hpp
│   │   │   ├── moe_base.hpp
│   │   │   └── test/
│   │   │       ├── amx-bkgroup-test.cpp
│   │   │       ├── amx-c-reduce-test.cpp
│   │   │       ├── amx-kgroup-test.cpp
│   │   │       ├── amx-test.cpp
│   │   │       ├── analyze-error.cpp
│   │   │       ├── avx-test.cpp
│   │   │       ├── debug-kgroup-details.cpp
│   │   │       ├── debug-kgroup.cpp
│   │   │       ├── debug-specific-dims.cpp
│   │   │       ├── mat-test.hpp
│   │   │       ├── mmq-test.cpp
│   │   │       ├── mmq.cpp
│   │   │       ├── mmq.h
│   │   │       ├── test-kgroup-128.cpp
│   │   │       ├── test-kgroup-kernel.cpp
│   │   │       ├── test-specific-dims.cpp
│   │   │       ├── thread_test.sh
│   │   │       ├── timer.hh
│   │   │       └── verify-kgroup.cpp
│   │   ├── common.hpp
│   │   ├── kvcache/
│   │   │   ├── kvcache.h
│   │   │   ├── kvcache_attn.cpp
│   │   │   ├── kvcache_load_dump.cpp
│   │   │   ├── kvcache_read_write.cpp
│   │   │   └── kvcache_utils.cpp
│   │   ├── llamafile/
│   │   │   ├── conversion.h
│   │   │   ├── linear.cpp
│   │   │   ├── linear.h
│   │   │   ├── mla.hpp
│   │   │   ├── mlp.cpp
│   │   │   ├── mlp.h
│   │   │   └── moe.hpp
│   │   ├── mla-tp.hpp
│   │   ├── moe-tp.hpp
│   │   ├── moe_kernel/
│   │   │   ├── api/
│   │   │   │   ├── common.h
│   │   │   │   └── mat_kernel.h
│   │   │   ├── la/
│   │   │   │   ├── kernel.hpp
│   │   │   │   ├── mat_kernel.cpp
│   │   │   │   └── utils.hpp
│   │   │   ├── mat_kernel/
│   │   │   │   ├── aocl_kernel/
│   │   │   │   │   └── kernel.cpp
│   │   │   │   └── batch_gemm_api.hpp
│   │   │   ├── moe.hpp
│   │   │   └── test/
│   │   │       ├── convert-test.cpp
│   │   │       ├── debug.hpp
│   │   │       ├── int4_mul-test.cpp
│   │   │       ├── mat_test.cpp
│   │   │       └── utils_test.cpp
│   │   ├── reduce.hpp
│   │   ├── rms-norm.hpp
│   │   ├── rope.hpp
│   │   ├── softmax.hpp
│   │   └── tp.hpp
│   ├── pyproject.toml
│   ├── pytest.ini
│   ├── python/
│   │   ├── __init__.py
│   │   ├── _cpu_detect.py
│   │   ├── cli/
│   │   │   ├── __init__.py
│   │   │   ├── commands/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── bench.py
│   │   │   │   ├── chat.py
│   │   │   │   ├── config.py
│   │   │   │   ├── doctor.py
│   │   │   │   ├── model.py
│   │   │   │   ├── quant.py
│   │   │   │   ├── run.py
│   │   │   │   ├── sft.py
│   │   │   │   └── version.py
│   │   │   ├── completions/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── _kt
│   │   │   │   ├── kt-completion.bash
│   │   │   │   └── kt.fish
│   │   │   ├── config/
│   │   │   │   ├── __init__.py
│   │   │   │   └── settings.py
│   │   │   ├── i18n.py
│   │   │   ├── main.py
│   │   │   ├── requirements/
│   │   │   │   ├── inference.txt
│   │   │   │   └── sft.txt
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── analyze_moe_model.py
│   │   │       ├── console.py
│   │   │       ├── debug_configs.py
│   │   │       ├── download_helper.py
│   │   │       ├── environment.py
│   │   │       ├── input_validators.py
│   │   │       ├── kv_cache_calculator.py
│   │   │       ├── model_discovery.py
│   │   │       ├── model_registry.py
│   │   │       ├── model_scanner.py
│   │   │       ├── model_table_builder.py
│   │   │       ├── model_verifier.py
│   │   │       ├── port_checker.py
│   │   │       ├── quant_interactive.py
│   │   │       ├── repo_detector.py
│   │   │       ├── run_configs.py
│   │   │       ├── run_interactive.py
│   │   │       ├── sglang_checker.py
│   │   │       ├── tuna_engine.py
│   │   │       └── user_model_registry.py
│   │   ├── experts.py
│   │   ├── experts_base.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── amx.py
│   │       ├── llamafile.py
│   │       ├── loader.py
│   │       └── moe_kernel.py
│   ├── requirements.txt
│   ├── scripts/
│   │   ├── README.md
│   │   ├── check.py
│   │   ├── check_cpu_features.py
│   │   ├── compare_weights.py
│   │   ├── convert_cpu_weights.py
│   │   ├── convert_gpu_weights.py
│   │   ├── convert_kimi_k2_fp8_to_bf16_cpu.py
│   │   ├── convert_moe_to_bf16.py
│   │   └── install-git-hooks.sh
│   ├── setup.py
│   └── test/
│       ├── __init__.py
│       ├── ci/
│       │   ├── __init__.py
│       │   ├── ci_register.py
│       │   └── ci_utils.py
│       ├── per_commit/
│       │   ├── __init__.py
│       │   ├── test_amd_placeholder.py
│       │   ├── test_basic_cpu.py
│       │   ├── test_cuda_placeholder.py
│       │   ├── test_moe_amx_accuracy_int4.py
│       │   ├── test_moe_amx_accuracy_int4_1.py
│       │   ├── test_moe_amx_accuracy_int4_1k.py
│       │   ├── test_moe_amx_accuracy_int8.py
│       │   ├── test_moe_amx_bench_int4.py
│       │   ├── test_moe_amx_bench_int4_1.py
│       │   ├── test_moe_amx_bench_int4_1k.py
│       │   └── test_moe_amx_bench_int8.py
│       ├── run_suite.py
│       └── test_generate_gpu_experts_masks.py
├── kt-sft/
│   ├── .flake8
│   ├── .gitignore
│   ├── .gitmodules
│   ├── .pylintrc
│   ├── Dockerfile
│   ├── Dockerfile.xpu
│   ├── LICENSE
│   ├── MANIFEST.in
│   ├── Makefile
│   ├── README.md
│   ├── SECURITY.md
│   ├── autosetup.sh
│   ├── book.toml
│   ├── csrc/
│   │   ├── custom_marlin/
│   │   │   ├── __init__.py
│   │   │   ├── binding.cpp
│   │   │   ├── gptq_marlin/
│   │   │   │   ├── gptq_marlin.cu
│   │   │   │   ├── gptq_marlin.cuh
│   │   │   │   ├── gptq_marlin_dtypes.cuh
│   │   │   │   ├── gptq_marlin_repack.cu
│   │   │   │   └── ops.h
│   │   │   ├── setup.py
│   │   │   ├── test_cuda_graph.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── format24.py
│   │   │       ├── marlin_24_perms.py
│   │   │       ├── marlin_perms.py
│   │   │       ├── marlin_utils.py
│   │   │       └── quant_utils.py
│   │   └── ktransformers_ext/
│   │       ├── CMakeLists.txt
│   │       ├── bench/
│   │       │   ├── bench_attention.py
│   │       │   ├── bench_attention_torch.py
│   │       │   ├── bench_linear.py
│   │       │   ├── bench_linear_torch.py
│   │       │   ├── bench_mlp.py
│   │       │   ├── bench_mlp_torch.py
│   │       │   ├── bench_moe.py
│   │       │   ├── bench_moe_amx.py
│   │       │   └── bench_moe_torch.py
│   │       ├── cmake/
│   │       │   └── FindSIMD.cmake
│   │       ├── cpu_backend/
│   │       │   ├── backend.cpp
│   │       │   ├── backend.h
│   │       │   ├── cpuinfer.h
│   │       │   ├── shared_mem_buffer.cpp
│   │       │   ├── shared_mem_buffer.h
│   │       │   ├── task_queue.cpp
│   │       │   ├── task_queue.h
│   │       │   └── vendors/
│   │       │       ├── README.md
│   │       │       ├── cuda.h
│   │       │       ├── hip.h
│   │       │       ├── musa.h
│   │       │       └── vendor.h
│   │       ├── cuda/
│   │       │   ├── binding.cpp
│   │       │   ├── custom_gguf/
│   │       │   │   ├── dequant.cu
│   │       │   │   └── ops.h
│   │       │   ├── gptq_marlin/
│   │       │   │   ├── gptq_marlin.cu
│   │       │   │   ├── gptq_marlin.cuh
│   │       │   │   ├── gptq_marlin_dtypes.cuh
│   │       │   │   └── ops.h
│   │       │   ├── setup.py
│   │       │   └── test_dequant.py
│   │       ├── examples/
│   │       │   ├── test_attention.py
│   │       │   ├── test_linear.py
│   │       │   ├── test_mlp.py
│   │       │   ├── test_moe.py
│   │       │   ├── test_sft_amx_moe.py
│   │       │   └── test_sft_moe.py
│   │       ├── ext_bindings.cpp
│   │       ├── operators/
│   │       │   ├── amx/
│   │       │   │   ├── debug_sft_moe.hpp
│   │       │   │   ├── debug_tools_sft_moe.hpp
│   │       │   │   ├── la/
│   │       │   │   │   ├── amx.hpp
│   │       │   │   │   └── utils.hpp
│   │       │   │   ├── moe.hpp
│   │       │   │   └── sft_moe.hpp
│   │       │   ├── kvcache/
│   │       │   │   ├── kvcache.h
│   │       │   │   ├── kvcache_attn.cpp
│   │       │   │   ├── kvcache_load_dump.cpp
│   │       │   │   ├── kvcache_read_write.cpp
│   │       │   │   └── kvcache_utils.cpp
│   │       │   └── llamafile/
│   │       │       ├── conversion.h
│   │       │       ├── linear.cpp
│   │       │       ├── linear.h
│   │       │       ├── mlp.cpp
│   │       │       ├── mlp.h
│   │       │       ├── moe.cpp
│   │       │       ├── moe.h
│   │       │       ├── sft_moe.cpp
│   │       │       ├── sft_moe.h
│   │       │       └── sft_moe_forward_cache.h
│   │       └── vendors/
│   │           ├── cuda.h
│   │           ├── hip.h
│   │           ├── musa.h
│   │           └── vendor.h
│   ├── install-with-cache.sh
│   ├── install.bat
│   ├── install.sh
│   ├── ktransformers/
│   │   ├── __init__.py
│   │   ├── configs/
│   │   │   ├── config.yaml
│   │   │   ├── log_config.ini
│   │   │   └── model_config/
│   │   │       ├── config.json
│   │   │       └── configuration_deepseek.py
│   │   ├── ktransformers_ext/
│   │   │   ├── operators/
│   │   │   │   └── custom_marlin/
│   │   │   │       └── quantize/
│   │   │   │           └── utils/
│   │   │   │               ├── __init__.py
│   │   │   │               ├── format_24.py
│   │   │   │               ├── marlin_24_perms.py
│   │   │   │               ├── marlin_perms.py
│   │   │   │               ├── marlin_utils.py
│   │   │   │               └── quant_utils.py
│   │   │   └── triton/
│   │   │       └── fp8gemm.py
│   │   ├── local_chat.py
│   │   ├── local_chat.sh
│   │   ├── lora_test_module.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── configuration_deepseek.py
│   │   │   ├── configuration_deepseek_v3.py
│   │   │   ├── configuration_llama.py
│   │   │   ├── configuration_qwen2_moe.py
│   │   │   ├── configuration_qwen3_moe.py
│   │   │   ├── custom_cache.py
│   │   │   ├── custom_modeling_deepseek_v2.py
│   │   │   ├── custom_modeling_deepseek_v3.py
│   │   │   ├── custom_modeling_qwen2_moe.py
│   │   │   ├── custom_modeling_qwen3_moe.py
│   │   │   ├── modeling_deepseek.py
│   │   │   ├── modeling_deepseek_v3.py
│   │   │   ├── modeling_llama.py
│   │   │   ├── modeling_mixtral.py
│   │   │   ├── modeling_qwen2_moe.py
│   │   │   └── modeling_qwen3_moe.py
│   │   ├── moe_test_module.py
│   │   ├── moe_test_module_old.py
│   │   ├── operators/
│   │   │   ├── RoPE.py
│   │   │   ├── __init__.py
│   │   │   ├── attention.py
│   │   │   ├── balance_serve_attention.py
│   │   │   ├── base_operator.py
│   │   │   ├── cpuinfer.py
│   │   │   ├── dynamic_attention.py
│   │   │   ├── experts.py
│   │   │   ├── flashinfer_batch_prefill_wrapper.py
│   │   │   ├── flashinfer_wrapper.py
│   │   │   ├── gate.py
│   │   │   ├── layernorm.py
│   │   │   ├── linear.py
│   │   │   ├── mlp.py
│   │   │   ├── models.py
│   │   │   ├── triton_attention.py
│   │   │   └── triton_attention_prefill.py
│   │   ├── optimize/
│   │   │   ├── optimize.py
│   │   │   └── optimize_rules/
│   │   │       ├── DeepSeek-V2-Chat-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V2-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Chat-sft-amx.yaml
│   │   │       ├── DeepSeek-V2-Chat.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-sft-amx-multi-gpu.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-sft-amx.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-sft.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat-use-adapter.yaml
│   │   │       ├── DeepSeek-V2-Lite-Chat.yaml
│   │   │       ├── DeepSeek-V3-Chat-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
│   │   │       ├── DeepSeek-V3-Chat-fp8-linear-ggml-experts.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-8.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-fp8-linear-ggml-experts.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu-marlin.yaml
│   │   │       ├── DeepSeek-V3-Chat-multi-gpu.yaml
│   │   │       ├── DeepSeek-V3-Chat-serve.yaml
│   │   │       ├── DeepSeek-V3-Chat-sft-amx-multi-gpu-4.yaml
│   │   │       ├── DeepSeek-V3-Chat-sft-amx-multi-gpu.yaml
│   │   │       ├── DeepSeek-V3-Chat-sft-amx.yaml
│   │   │       ├── DeepSeek-V3-Chat.yaml
│   │   │       ├── Internlm2_5-7b-Chat-1m.yaml
│   │   │       ├── Mixtral.yaml
│   │   │       ├── Moonlight-16B-A3B-serve.yaml
│   │   │       ├── Moonlight-16B-A3B.yaml
│   │   │       ├── Qwen2-57B-A14B-Instruct-multi-gpu.yaml
│   │   │       ├── Qwen2-57B-A14B-Instruct.yaml
│   │   │       ├── Qwen2-serve-amx.yaml
│   │   │       ├── Qwen2-serve.yaml
│   │   │       ├── Qwen3Moe-serve-amx.yaml
│   │   │       ├── Qwen3Moe-serve.yaml
│   │   │       ├── Qwen3Moe-sft-amx.yaml
│   │   │       ├── rocm/
│   │   │       │   └── DeepSeek-V3-Chat.yaml
│   │   │       └── xpu/
│   │   │           ├── DeepSeek-V2-Chat.yaml
│   │   │           ├── DeepSeek-V3-Chat.yaml
│   │   │           └── Qwen3Moe-Chat.yaml
│   │   ├── server/
│   │   │   ├── __init__.py
│   │   │   ├── api/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── ollama/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── completions.py
│   │   │   │   ├── openai/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── assistants/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── assistants.py
│   │   │   │   │   │   ├── messages.py
│   │   │   │   │   │   ├── runs.py
│   │   │   │   │   │   └── threads.py
│   │   │   │   │   ├── endpoints/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── chat.py
│   │   │   │   │   └── legacy/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── completions.py
│   │   │   │   └── web/
│   │   │   │       ├── __init__.py
│   │   │   │       └── system.py
│   │   │   ├── args.py
│   │   │   ├── backend/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── base.py
│   │   │   │   ├── context_manager.py
│   │   │   │   └── interfaces/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── balance_serve.py
│   │   │   │       ├── exllamav2.py
│   │   │   │       ├── ktransformers.py
│   │   │   │       └── transformers.py
│   │   │   ├── balance_serve/
│   │   │   │   ├── inference/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── config.py
│   │   │   │   │   ├── distributed/
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   ├── communication_op.py
│   │   │   │   │   │   ├── cuda_wrapper.py
│   │   │   │   │   │   ├── custom_all_reduce.py
│   │   │   │   │   │   ├── custom_all_reduce_utils.py
│   │   │   │   │   │   ├── parallel_state.py
│   │   │   │   │   │   ├── pynccl.py
│   │   │   │   │   │   ├── pynccl_wrapper.py
│   │   │   │   │   │   └── utils.py
│   │   │   │   │   ├── forward_batch.py
│   │   │   │   │   ├── model_runner.py
│   │   │   │   │   ├── query_manager.py
│   │   │   │   │   └── sampling/
│   │   │   │   │       ├── penaltylib/
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── orchestrator.py
│   │   │   │   │       │   └── penalizers/
│   │   │   │   │       │       ├── frequency_penalty.py
│   │   │   │   │       │       ├── min_new_tokens.py
│   │   │   │   │       │       ├── presence_penalty.py
│   │   │   │   │       │       └── repetition_penalty.py
│   │   │   │   │       └── sampler.py
│   │   │   │   ├── sched_rpc.py
│   │   │   │   └── settings.py
│   │   │   ├── config/
│   │   │   │   ├── config.py
│   │   │   │   ├── log.py
│   │   │   │   └── singleton.py
│   │   │   ├── crud/
│   │   │   │   ├── __init__.py
│   │   │   │   └── assistants/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── assistants.py
│   │   │   │       ├── messages.py
│   │   │   │       ├── runs.py
│   │   │   │       └── threads.py
│   │   │   ├── exceptions.py
│   │   │   ├── main.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   └── assistants/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── assistants.py
│   │   │   │       ├── messages.py
│   │   │   │       ├── run_steps.py
│   │   │   │       ├── runs.py
│   │   │   │       └── threads.py
│   │   │   ├── schemas/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── assistants/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── assistants.py
│   │   │   │   │   ├── messages.py
│   │   │   │   │   ├── runs.py
│   │   │   │   │   ├── streaming.py
│   │   │   │   │   ├── threads.py
│   │   │   │   │   └── tool.py
│   │   │   │   ├── base.py
│   │   │   │   ├── conversation.py
│   │   │   │   ├── endpoints/
│   │   │   │   │   └── chat.py
│   │   │   │   └── legacy/
│   │   │   │       ├── __init__.py
│   │   │   │       └── completions.py
│   │   │   └── utils/
│   │   │       ├── __init__.py
│   │   │       ├── create_interface.py
│   │   │       ├── multi_timer.py
│   │   │       └── sql_utils.py
│   │   ├── sft/
│   │   │   ├── __init__.py
│   │   │   ├── flops_utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── custom_profile.py
│   │   │   │   └── lora_test_utils.py
│   │   │   ├── lora.py
│   │   │   ├── metrics.py
│   │   │   ├── metrics_utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── constants.py
│   │   │   │   ├── env.py
│   │   │   │   ├── logging.py
│   │   │   │   ├── misc.py
│   │   │   │   ├── packages.py
│   │   │   │   └── ploting.py
│   │   │   ├── monkey_patch_torch_module.py
│   │   │   ├── peft_utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lora_layer.py
│   │   │   │   ├── lora_model.py
│   │   │   │   ├── mapping.py
│   │   │   │   └── peft_model.py
│   │   │   └── torchviz_test.py
│   │   ├── tests/
│   │   │   ├── .gitignore
│   │   │   ├── AIME_2024/
│   │   │   │   ├── eval_api.py
│   │   │   │   ├── evaluation.py
│   │   │   │   └── prompts.py
│   │   │   ├── dequant_gpu.py
│   │   │   ├── dequant_gpu_t.py
│   │   │   ├── function_call_test.py
│   │   │   ├── humaneval/
│   │   │   │   ├── eval_api.py
│   │   │   │   ├── evaluation.py
│   │   │   │   └── prompts.py
│   │   │   ├── mmlu_pro_test.py
│   │   │   ├── mmlu_test.py
│   │   │   ├── mmlu_test_multi.py
│   │   │   ├── score.py
│   │   │   ├── test_client.py
│   │   │   ├── test_pytorch_q8.py
│   │   │   ├── test_speed.py
│   │   │   └── triton_fp8gemm_test.py
│   │   ├── util/
│   │   │   ├── cuda_graph_runner.py
│   │   │   ├── custom_gguf.py
│   │   │   ├── custom_loader.py
│   │   │   ├── globals.py
│   │   │   ├── grad_wrapper.py
│   │   │   ├── inference_state.py
│   │   │   ├── modeling_rope_utils.py
│   │   │   ├── textstream.py
│   │   │   ├── utils.py
│   │   │   ├── vendors.py
│   │   │   └── weight_loader.py
│   │   └── website/
│   │       ├── .browserslistrc
│   │       ├── .eslintrc.js
│   │       ├── .gitignore
│   │       ├── README.md
│   │       ├── config.d.ts
│   │       ├── jest.config.js
│   │       ├── package.json
│   │       ├── public/
│   │       │   ├── config.js
│   │       │   ├── css/
│   │       │   │   └── reset.css
│   │       │   └── index.html
│   │       ├── src/
│   │       │   ├── App.vue
│   │       │   ├── api/
│   │       │   │   ├── api-client.ts
│   │       │   │   ├── assistant.ts
│   │       │   │   ├── message.ts
│   │       │   │   ├── run.ts
│   │       │   │   └── thread.ts
│   │       │   ├── assets/
│   │       │   │   ├── css/
│   │       │   │   │   └── mixins.styl
│   │       │   │   └── iconfont/
│   │       │   │       ├── demo.css
│   │       │   │       ├── demo_index.html
│   │       │   │       ├── iconfont.css
│   │       │   │       ├── iconfont.js
│   │       │   │       └── iconfont.json
│   │       │   ├── components/
│   │       │   │   └── chat/
│   │       │   │       └── index.vue
│   │       │   ├── conf/
│   │       │   │   └── config.ts
│   │       │   ├── locals/
│   │       │   │   ├── en.js
│   │       │   │   ├── index.js
│   │       │   │   └── zh.js
│   │       │   ├── main.ts
│   │       │   ├── router/
│   │       │   │   └── index.ts
│   │       │   ├── shims-vue.d.ts
│   │       │   ├── store/
│   │       │   │   └── index.ts
│   │       │   ├── utils/
│   │       │   │   ├── copy.ts
│   │       │   │   └── types.ts
│   │       │   └── views/
│   │       │       └── home.vue
│   │       ├── tests/
│   │       │   └── unit/
│   │       │       └── example.spec.ts
│   │       ├── tsconfig.json
│   │       └── vue.config.js
│   ├── merge_tensors/
│   │   └── merge_safetensor_gguf.py
│   ├── pyproject.toml
│   ├── requirements-sft.txt
│   ├── setup.py
│   ├── test_adapter/
│   │   ├── data_transfer.py
│   │   ├── infer_with_adapter.py
│   │   ├── inspect_adapter.py
│   │   ├── pred2metrics.py
│   │   ├── test_grad.py
│   │   └── time_test_lora_train.py
│   └── withoutKT_PEFT.py
├── pyproject.toml
├── setup.py
├── third_party/
│   └── llamafile/
│       ├── README.md
│       ├── bench.h
│       ├── flags.cpp
│       ├── flags.h
│       ├── iqk_mul_mat.inc
│       ├── iqk_mul_mat_amd_avx2.cpp
│       ├── iqk_mul_mat_amd_zen4.cpp
│       ├── iqk_mul_mat_arm.inc
│       ├── iqk_mul_mat_arm82.cpp
│       ├── macros.h
│       ├── micros.h
│       ├── numba.h
│       ├── sgemm.cpp
│       ├── sgemm.h
│       ├── tinyblas_cpu.h
│       ├── tinyblas_cpu_mixmul.inc
│       ├── tinyblas_cpu_mixmul_amd_avx.cpp
│       ├── tinyblas_cpu_mixmul_amd_avx2.cpp
│       ├── tinyblas_cpu_mixmul_amd_avx512f.cpp
│       ├── tinyblas_cpu_mixmul_amd_avxvnni.cpp
│       ├── tinyblas_cpu_mixmul_amd_fma.cpp
│       ├── tinyblas_cpu_mixmul_amd_zen4.cpp
│       ├── tinyblas_cpu_mixmul_arm80.cpp
│       ├── tinyblas_cpu_mixmul_arm82.cpp
│       ├── tinyblas_cpu_sgemm.inc
│       ├── tinyblas_cpu_sgemm_amd_avx.cpp
│       ├── tinyblas_cpu_sgemm_amd_avx2.cpp
│       ├── tinyblas_cpu_sgemm_amd_avx512f.cpp
│       ├── tinyblas_cpu_sgemm_amd_avxvnni.cpp
│       ├── tinyblas_cpu_sgemm_amd_fma.cpp
│       ├── tinyblas_cpu_sgemm_amd_zen4.cpp
│       ├── tinyblas_cpu_sgemm_arm80.cpp
│       ├── tinyblas_cpu_sgemm_arm82.cpp
│       └── tinyblas_cpu_unsupported.cpp
└── version.py